--- title: Criteo Dataset Transformation keywords: fastai sidebar: home_sidebar summary: "Implementation of transformation functions specific to criteo ad-display dataset." description: "Implementation of transformation functions specific to criteo ad-display dataset." nb_path: "nbs/transforms/datasets/criteo.ipynb" ---
{% raw %}
{% endraw %} {% raw %}
{% endraw %} {% raw %}

sparseFeature[source]

sparseFeature(feat, feat_num, embed_dim=4)

Create a dictionary describing a sparse feature. :param feat: feature name :param feat_num: the number of distinct (non-repeating) values of the sparse feature :param embed_dim: embedding dimension :return: the sparse feature dictionary

{% endraw %} {% raw %}

denseFeature[source]

denseFeature(feat)

Create a dictionary describing a dense feature. :param feat: dense feature name :return: the dense feature dictionary

{% endraw %} {% raw %}
{% endraw %} {% raw %}

create_criteo_dataset[source]

create_criteo_dataset(file, embed_dim=8, read_part=True, sample_num=100000, test_size=0.2)

An example of creating the Criteo dataset. :param file: path to the dataset file :param embed_dim: the embedding dimension of the sparse features :param read_part: whether to read only part of the file :param sample_num: the number of instances to read if read_part is True :param test_size: fraction of the data used for the test set :return: feature columns, train, test

{% endraw %} {% raw %}
{% endraw %} {% raw %}
# !pip install --upgrade --force-reinstall --no-deps kaggle
# !mkdir ~/.kaggle
# !cp /content/drive/MyDrive/kaggle.json ~/.kaggle/
# !chmod 600 ~/.kaggle/kaggle.json
# !kaggle datasets download -d mrkmakr/criteo-dataset
# !unzip criteo-dataset.zip
Collecting kaggle
  Downloading kaggle-1.5.12.tar.gz (58 kB)
     |████████████████████████████████| 58 kB 2.7 MB/s 
Building wheels for collected packages: kaggle
  Building wheel for kaggle (setup.py) ... done
  Created wheel for kaggle: filename=kaggle-1.5.12-py3-none-any.whl size=73051 sha256=a09d2576937c68b6341e6bce9eeefa020563e125d97e69548f4d591568008b5f
  Stored in directory: /root/.cache/pip/wheels/62/d6/58/5853130f941e75b2177d281eb7e44b4a98ed46dd155f556dc5
Successfully built kaggle
Installing collected packages: kaggle
  Attempting uninstall: kaggle
    Found existing installation: kaggle 1.5.12
    Uninstalling kaggle-1.5.12:
      Successfully uninstalled kaggle-1.5.12
Successfully installed kaggle-1.5.12
Downloading criteo-dataset.zip to /content
100% 4.31G/4.31G [01:20<00:00, 58.4MB/s]
100% 4.31G/4.31G [01:20<00:00, 57.4MB/s]
Archive:  criteo-dataset.zip
  inflating: dac/readme.txt          
  inflating: dac/test.txt            
  inflating: dac/train.txt           
{% endraw %} {% raw %}
file = 'dac/train.txt'
read_part = True
sample_num = 10000
test_size = 0.2

feature_columns, train, test = create_criteo_dataset(file=file,
                                        read_part=read_part,
                                        sample_num=sample_num,
                                        test_size=test_size)
{% endraw %} {% raw %}
feature_columns
[{'embed_dim': 8, 'feat_name': 'C1', 'feat_num': 175},
 {'embed_dim': 8, 'feat_name': 'C2', 'feat_num': 386},
 {'embed_dim': 8, 'feat_name': 'C3', 'feat_num': 5521},
 {'embed_dim': 8, 'feat_name': 'C4', 'feat_num': 4033},
 {'embed_dim': 8, 'feat_name': 'C5', 'feat_num': 56},
 {'embed_dim': 8, 'feat_name': 'C6', 'feat_num': 8},
 {'embed_dim': 8, 'feat_name': 'C7', 'feat_num': 3184},
 {'embed_dim': 8, 'feat_name': 'C8', 'feat_num': 93},
 {'embed_dim': 8, 'feat_name': 'C9', 'feat_num': 3},
 {'embed_dim': 8, 'feat_name': 'C10', 'feat_num': 2986},
 {'embed_dim': 8, 'feat_name': 'C11', 'feat_num': 2084},
 {'embed_dim': 8, 'feat_name': 'C12', 'feat_num': 5284},
 {'embed_dim': 8, 'feat_name': 'C13', 'feat_num': 1725},
 {'embed_dim': 8, 'feat_name': 'C14', 'feat_num': 24},
 {'embed_dim': 8, 'feat_name': 'C15', 'feat_num': 2035},
 {'embed_dim': 8, 'feat_name': 'C16', 'feat_num': 4724},
 {'embed_dim': 8, 'feat_name': 'C17', 'feat_num': 9},
 {'embed_dim': 8, 'feat_name': 'C18', 'feat_num': 1149},
 {'embed_dim': 8, 'feat_name': 'C19', 'feat_num': 547},
 {'embed_dim': 8, 'feat_name': 'C20', 'feat_num': 4},
 {'embed_dim': 8, 'feat_name': 'C21', 'feat_num': 5037},
 {'embed_dim': 8, 'feat_name': 'C22', 'feat_num': 8},
 {'embed_dim': 8, 'feat_name': 'C23', 'feat_num': 12},
 {'embed_dim': 8, 'feat_name': 'C24', 'feat_num': 2525},
 {'embed_dim': 8, 'feat_name': 'C25', 'feat_num': 40},
 {'embed_dim': 8, 'feat_name': 'C26', 'feat_num': 1939},
 {'embed_dim': 8, 'feat_name': 'I1', 'feat_num': 100},
 {'embed_dim': 8, 'feat_name': 'I2', 'feat_num': 100},
 {'embed_dim': 8, 'feat_name': 'I3', 'feat_num': 100},
 {'embed_dim': 8, 'feat_name': 'I4', 'feat_num': 100},
 {'embed_dim': 8, 'feat_name': 'I5', 'feat_num': 100},
 {'embed_dim': 8, 'feat_name': 'I6', 'feat_num': 100},
 {'embed_dim': 8, 'feat_name': 'I7', 'feat_num': 100},
 {'embed_dim': 8, 'feat_name': 'I8', 'feat_num': 100},
 {'embed_dim': 8, 'feat_name': 'I9', 'feat_num': 100},
 {'embed_dim': 8, 'feat_name': 'I10', 'feat_num': 100},
 {'embed_dim': 8, 'feat_name': 'I11', 'feat_num': 100},
 {'embed_dim': 8, 'feat_name': 'I12', 'feat_num': 100},
 {'embed_dim': 8, 'feat_name': 'I13', 'feat_num': 100}]
{% endraw %} {% raw %}
train
(array([[   1,  293, 2491, ...,    0,    0,    1],
        [   1,   88,    0, ...,    1,    0,    1],
        [   1,   17, 5197, ...,    1,    0,    0],
        ...,
        [   1,  355, 4284, ...,    3,    0,    0],
        [   1,  192,   56, ...,    1,    0,    0],
        [  75,   18, 2613, ...,    3,    0,    0]], dtype=int32),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int32))
{% endraw %} {% raw %}
test
(array([[ 111,  105,  695, ...,    3,    0,    0],
        [ 102,  337, 2613, ...,    0,    0,    1],
        [  75,  301,  155, ...,    1,    0,    0],
        ...,
        [  75,   86,  507, ...,    1,    1,    1],
        [   1,  347, 2205, ...,    2,    1,    1],
        [ 102,  125,    5, ...,    1,    1,    0]], dtype=int32),
 array([1, 0, 1, ..., 0, 0, 0], dtype=int32))
{% endraw %}