---
title: Title
keywords: fastai
sidebar: home_sidebar
---
{% raw %}
{% endraw %} {% raw %}
%reload_ext autoreload
%autoreload 2
%matplotlib inline
{% endraw %} {% raw %}
from fastai.vision import *
from fastai.tabular import *
from image_tabular.core import *
from image_tabular.dataset import *
from image_tabular.model import *
from image_tabular.metric import *

# use gpu by default if available: the first CUDA device, otherwise fall back to the cpu
# `device` is used further down to place the integrated model and the loss weights
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
{% endraw %} {% raw %}
import warnings
# ignore UserWarning messages that originate from the torch.nn.functional module
warnings.filterwarnings("ignore", category=UserWarning, module="torch.nn.functional")
{% endraw %} {% raw %}
# root folder of the competition data; the csv files and the image folders
# used below are all resolved relative to this path
data_path = Path("./data/siim-isic-melanoma-classification/")
{% endraw %} {% raw %}
# read the train and test metadata tables that sit next to the images
train_df = pd.read_csv(data_path.joinpath("train.csv"))
test_df = pd.read_csv(data_path.joinpath("test.csv"))

# number of rows in each table: train first, then test
print(f"{len(train_df)} {len(test_df)}")
33126 10982
{% endraw %} {% raw %}
# preview the first five rows of the training metadata
train_df.head()
image_name patient_id sex age_approx anatom_site_general_challenge diagnosis benign_malignant target
0 ISIC_2637011 IP_7279968 male 45.0 head/neck unknown benign 0
1 ISIC_0015719 IP_3075186 female 45.0 upper extremity unknown benign 0
2 ISIC_0052212 IP_2842074 female 50.0 lower extremity nevus benign 0
3 ISIC_0068279 IP_6890425 female 45.0 head/neck unknown benign 0
4 ISIC_0074268 IP_8723313 female 55.0 upper extremity unknown benign 0
{% endraw %} {% raw %}
# extremely unbalanced dataset, most of the images are benign (target 0)
# normalize=True reports the share of each class instead of raw counts
train_df["target"].value_counts(normalize=True)
0    0.98237
1    0.01763
Name: target, dtype: float64
{% endraw %}

Image data

{% raw %}
# image transforms from fastai's get_transforms, with vertical flips enabled
tfms = get_transforms(flip_vert=True)
# target image size passed to .transform() below; items show up as (3, 128, 128)
size = 128
{% endraw %} {% raw %}
# idx for validation, shared by image and tabular data
# the same row indices split both datasets, so every image stays paired with its tabular row
val_idx = get_valid_index(train_df)
# number of rows held out for validation
len(val_idx)
6625
{% endraw %} {% raw %}
# load image data using train_df and prepare fastai LabelLists:
# - file names come from the "image_name" column plus ".jpg", inside data_path/train_128
# - rows listed in val_idx form the validation set, the rest is used for training
# - the "target" column is the label
# - tfms are applied and images are sized to `size`
image_data = (ImageList.from_df(train_df, path=data_path, cols="image_name",
                               folder="train_128", suffix=".jpg")
              .split_by_idx(val_idx)
              .label_from_df(cols="target")
              .transform(tfms, size=size))

# add test data so that we can make predictions
# test images are read from data_path/test_128 and carry no labels
test_image_data = ImageList.from_df(test_df, path=data_path, cols="image_name",
                                    folder="test_128", suffix=".jpg")
image_data.add_test(test_image_data)
LabelLists;

Train: LabelList (26501 items)
x: ImageList
Image (3, 128, 128),Image (3, 128, 128),Image (3, 128, 128),Image (3, 128, 128),Image (3, 128, 128)
y: CategoryList
0,0,0,0,0
Path: data/siim-isic-melanoma-classification;

Valid: LabelList (6625 items)
x: ImageList
Image (3, 128, 128),Image (3, 128, 128),Image (3, 128, 128),Image (3, 128, 128),Image (3, 128, 128)
y: CategoryList
0,0,0,0,0
Path: data/siim-isic-melanoma-classification;

Test: LabelList (10982 items)
x: ImageList
Image (3, 128, 128),Image (3, 128, 128),Image (3, 128, 128),Image (3, 128, 128),Image (3, 128, 128)
y: EmptyLabelList
,,,,
Path: data/siim-isic-melanoma-classification
{% endraw %} {% raw %}
# show one example image
# each training item is an (image, label) pair: print the label first,
# then leave the image as the last expression so the notebook displays it
print(image_data.train[0][1])
image_data.train[0][0]
0
{% endraw %}

Tabular data

{% raw %}
# label column
dep_var = 'target'
# categorical columns; their embedding sizes are requested later via get_emb_szs()
cat_names = ['sex', 'anatom_site_general_challenge']
# continuous columns
cont_names = ['age_approx']
# fastai tabular preprocessing steps handed to TabularList.from_df
procs = [FillMissing, Categorify, Normalize]
{% endraw %} {% raw %}
# build the tabular LabelLists from train_df, using the same validation indices as the image data
tab_data = (TabularList.from_df(train_df, path=data_path, cat_names=cat_names, cont_names=cont_names, procs=procs)
                           .split_by_idx(val_idx)
                           .label_from_df(cols=dep_var))

# add test
# the test rows reuse the processor of the training TabularList,
# so they go through the same preprocessing as the training rows
tab_data.add_test(TabularList.from_df(test_df, cat_names=cat_names, cont_names=cont_names,
                                      processor = tab_data.train.x.processor))
LabelLists;

Train: LabelList (26501 items)
x: TabularList
sex male; anatom_site_general_challenge head/neck; age_approx_na False; age_approx -0.2713; ,sex female; anatom_site_general_challenge upper extremity; age_approx_na False; age_approx -0.2713; ,sex female; anatom_site_general_challenge lower extremity; age_approx_na False; age_approx 0.0768; ,sex female; anatom_site_general_challenge head/neck; age_approx_na False; age_approx -0.2713; ,sex female; anatom_site_general_challenge lower extremity; age_approx_na False; age_approx -0.6195; 
y: CategoryList
0,0,0,0,0
Path: data/siim-isic-melanoma-classification;

Valid: LabelList (6625 items)
x: TabularList
sex female; anatom_site_general_challenge upper extremity; age_approx_na False; age_approx -0.2713; ,sex female; anatom_site_general_challenge upper extremity; age_approx_na False; age_approx -0.2713; ,sex male; anatom_site_general_challenge upper extremity; age_approx_na False; age_approx -0.2713; ,sex male; anatom_site_general_challenge upper extremity; age_approx_na False; age_approx 1.1214; ,sex male; anatom_site_general_challenge torso; age_approx_na False; age_approx -1.3159; 
y: CategoryList
0,0,0,0,0
Path: data/siim-isic-melanoma-classification;

Test: LabelList (10982 items)
x: TabularList
sex male; anatom_site_general_challenge #na#; age_approx_na False; age_approx 1.4696; ,sex male; anatom_site_general_challenge lower extremity; age_approx_na False; age_approx -0.6195; ,sex female; anatom_site_general_challenge torso; age_approx_na False; age_approx 0.4250; ,sex female; anatom_site_general_challenge torso; age_approx_na False; age_approx 0.0768; ,sex female; anatom_site_general_challenge lower extremity; age_approx_na False; age_approx -0.2713; 
y: EmptyLabelList
,,,,
Path: data/siim-isic-melanoma-classification
{% endraw %} {% raw %}
# one example: a tabular row (categorical codes, continuous values) together with its label
tab_data.train[0]
(TabularLine [tensor([2, 1, 1]), tensor([-0.2713])], Category 0)
{% endraw %}

Integrate image and tabular data

{% raw %}
# combine the image and tabular LabelLists into paired train, valid, and test datasets
integrate_train, integrate_valid, integrate_test = get_imagetabdatasets(image_data, tab_data)
{% endraw %} {% raw %}
# package train, valid, and test datasets into a fastai databunch
# batch size used by the data loaders
bs = 64

db = DataBunch.create(integrate_train, integrate_valid, integrate_test,
                      path=data_path, bs=bs)
# display the databunch as the cell output
db
DataBunch;

Train: <image_tabular.dataset.ImageTabDataset object at 0x7f68646769e8>;

Valid: <image_tabular.dataset.ImageTabDataset object at 0x7f6864676b00>;

Test: <image_tabular.dataset.ImageTabDataset object at 0x7f6864676b38>
{% endraw %} {% raw %}
# image normalization with imagenet_stats
# normalize_funcs_image_tab returns a (normalize, denormalize) pair; both are kept on the databunch
db.norm, db.denorm = normalize_funcs_image_tab(*imagenet_stats)
# add the normalize function to the databunch's transforms
db.add_tfm(db.norm)
{% endraw %} {% raw %}
# check the shape of one batch
# x holds the model inputs (images and tabular data), y holds the targets
x, y = next(iter(db.train_dl))
len(x)
2
{% endraw %} {% raw %}
# images: first element of x
x[0].shape
torch.Size([64, 3, 128, 128])
{% endraw %} {% raw %}
# categorical and continuous tabular data: second element of x, as a (categorical, continuous) pair
x[1][0].shape, x[1][1].shape
(torch.Size([64, 3]), torch.Size([64, 1]))
{% endraw %} {% raw %}
# targets: one label per item in the batch
y.shape
torch.Size([64])
{% endraw %}

Model that trains on image and tabular data simultaneously

{% raw %}
# cnn model for images, use Resnet50 as an example
cnn_arch = models.resnet50

# cnn_out_sz is the output size of the cnn model that will be concatenated with tabular model output
cnn_out_sz = 256

# use fastai functions to get a cnn model
# an image-only databunch is created just to build the learner; its class count `c` is
# overridden with cnn_out_sz so that the cnn produces cnn_out_sz features instead of class scores
image_data_db = image_data.databunch()
image_data_db.c = cnn_out_sz
cnn_learn = cnn_learner(image_data_db, cnn_arch, ps=0.2)
# keep the underlying model; cnn_learn itself is reused later for its layer groups
cnn_model = cnn_learn.model
{% endraw %} {% raw %}
# get embedding sizes of categorical data
emb_szs = tab_data.train.get_emb_szs()

# output size of the tabular model that will be concatenated with cnn model output
tab_out_sz = 8

# use fastai functions to get a tabular model:
# one hidden layer of 8 units (layers=[8]) with dropout 0.2, then an output layer of tab_out_sz units
tabular_model = TabularModel(emb_szs, len(cont_names), out_sz=tab_out_sz, layers=[8], ps=0.2)
# display the model structure as the cell output
tabular_model
TabularModel(
  (embeds): ModuleList(
    (0): Embedding(3, 3)
    (1): Embedding(7, 5)
    (2): Embedding(3, 3)
  )
  (emb_drop): Dropout(p=0.0, inplace=False)
  (bn_cont): BatchNorm1d(1, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): Linear(in_features=12, out_features=8, bias=True)
    (1): ReLU(inplace=True)
    (2): BatchNorm1d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.2, inplace=False)
    (4): Linear(in_features=8, out_features=8, bias=True)
  )
)
{% endraw %} {% raw %}
# get an integrated model that combines the two components and concatenate their outputs
# which will pass through additional fully connected layers
# layers: the first entry is the width of the concatenated features (cnn_out_sz + tab_out_sz), followed by 32
# out_sz=2: one output per class (benign / malignant)
integrate_model = CNNTabularModel(cnn_model,
                                  tabular_model,
                                  layers = [cnn_out_sz + tab_out_sz, 32],
                                  ps=0.2,
                                  out_sz=2).to(device)
{% endraw %} {% raw %}
# check model output dimension, should be (bs, 2)
# x is the sample batch fetched above; it is unpacked into the image and tabular inputs
integrate_model(*x).shape
torch.Size([64, 2])
{% endraw %} {% raw %}
# the dataset is extremely unbalanced, so weight each class by the inverse of its frequency:
# weights[0] = 1 / share of target 0, weights[1] = 1 / share of target 1
weights = [
    1 / share
    for share in (1 - train_df["target"].mean(), train_df["target"].mean())
]
class_weights = torch.FloatTensor(weights).to(device)
loss_func = CrossEntropyFlat(weight=class_weights)
{% endraw %} {% raw %}
# package everything in a fastai learner, add auc roc score as a metric
# accuracy is reported as well; loss_func is the class-weighted cross entropy
learn = Learner(db, integrate_model, metrics=[accuracy, ROCAUC()], loss_func=loss_func)
{% endraw %} {% raw %}
# organize layer groups in order to use differential learning rates provided by fastai:
# - the first two groups mirror the first two layer groups of the cnn learner (earlier layers of resnet)
# - the last group gathers the fully connected layers of the cnn model, the tabular model,
#   and the final fully connected layers for the concatenated data
learn.layer_groups = [
    nn.Sequential(*flatten_model(group)) for group in cnn_learn.layer_groups[:2]
] + [
    nn.Sequential(
        *flatten_model(cnn_learn.layer_groups[2]),
        *flatten_model(integrate_model.tabular_model),
        *flatten_model(integrate_model.layers)
    )
]
{% endraw %}

Training

{% raw %}
# find learning rate to train the last layer group first
# freeze the earlier layer groups, run the learning rate finder, then plot its recorded losses
learn.freeze()
learn.lr_find()
learn.recorder.plot()
0.00% [0/1 00:00<00:00]
epoch train_loss valid_loss accuracy rocauc time

22.46% [93/414 00:08<00:28 2.3661]
LR Finder is complete, type {learner_name}.recorder.plot() to see the graph.
{% endraw %} {% raw %}
# train the still-frozen model for 10 epochs with the one-cycle policy at learning rate 1e-4
learn.fit_one_cycle(10, 1e-4)
epoch train_loss valid_loss accuracy rocauc time
0 0.705341 0.606712 0.590943 0.789852 00:34
1 0.624063 0.564126 0.663849 0.838528 00:35
2 0.566842 0.559899 0.656151 0.844186 00:35
3 0.564589 0.549224 0.685132 0.853641 00:35
4 0.554486 0.496429 0.754717 0.871573 00:35
5 0.513213 0.498709 0.757887 0.870145 00:35
6 0.523281 0.499963 0.747774 0.870568 00:35
7 0.506248 0.488594 0.768604 0.875076 00:35
8 0.478553 0.478242 0.781434 0.877582 00:35
9 0.484625 0.480606 0.768302 0.877502 00:35
{% endraw %} {% raw %}
# unfreeze all layer groups to train the entire model using differential learning rates
# slice(1e-6, 1e-4) spreads the learning rates across the layer groups defined above
learn.unfreeze()
learn.fit_one_cycle(5, slice(1e-6, 1e-4))
epoch train_loss valid_loss accuracy rocauc time
0 0.472651 0.488090 0.792906 0.857081 00:49
1 0.491386 0.446133 0.838943 0.882625 00:48
2 0.458774 0.463548 0.821585 0.866160 00:48
3 0.411819 0.449613 0.821283 0.877032 00:48
4 0.450356 0.448963 0.836528 0.874391 00:48
{% endraw %}

After fine-tuning, the model achieved an ROC AUC score of about 0.87 on the validation set (0.874 after the final epoch).

Prediction

{% raw %}
# make predictions for the test set
# only preds is used afterwards (one column per class); the test set is unlabeled
# note: this rebinds y, which until now held the targets of the sample training batch
preds, y = learn.get_preds(DatasetType.Test)
{% endraw %} {% raw %}
# submit predictions to kaggle
# start from the sample submission so the output keeps its columns and row order
# NOTE(review): assumes sample_submission.csv lists the images in the same order as test.csv - confirm
submit = pd.read_csv(data_path/"sample_submission.csv")
# column 1 of preds is the score for class 1 (malignant)
# convert the tensor to a NumPy array explicitly so that pandas stores plain floats;
# assigning a torch tensor directly can, depending on the pandas version, produce an
# object column of 0-d tensors that is written to the csv as "tensor(...)"
# (detach/cpu are no-ops when preds is already a detached cpu tensor)
submit["target"] = preds[:, 1].detach().cpu().numpy()
submit.to_csv(data_path/"image_tab.csv", index=False)
{% endraw %}