---
title: Splitting
keywords: fastai
sidebar: home_sidebar
summary: "Data Splitting Transforms."
description: "Data Splitting Transforms."
nb_path: "nbs/transforms/splitting.ipynb"
---
{% raw %}
{% endraw %} {% raw %}
{% endraw %}

Split by Ratio

{% raw %}

split_by_ratio[source]

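split_by_ratio(data, shuffle=False, test_size=None, pad_unknown=True, filter_unknown=False, seed=42)

Split `data` into a training set and a test set by ratio. As the examples below show, the split is made per user: the last `test_size` fraction of each user's rows is held out for testing, and users with too few rows to fill that fraction stay entirely in the training set.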

{% endraw %} {% raw %}
{% endraw %} {% raw %}
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

df = pd.DataFrame.from_dict(
    {
        'user':[1,1,1,1,1,2,2,3],
        'item':[1,2,3,2,2,1,2,3]
    }
)

df
   user  item
0     1     1
1     1     2
2     1     3
3     1     2
4     1     2
5     2     1
6     2     2
7     3     3
{% endraw %} {% raw %}
train, test = split_by_ratio(df, shuffle=False, test_size=0.2, pad_unknown=True, filter_unknown=False)
print("train:\n{}\n\ntest:\n{}".format(train,test))
train:
   user  item
0     1     1
1     1     2
2     1     3
3     1     2
5     2     1
6     2     2
7     3     3

test:
   user  item
4     1     2
{% endraw %} {% raw %}
train, test = split_by_ratio(df, shuffle=False, test_size=0.4, pad_unknown=True, filter_unknown=True)
print("train:\n{}\n\ntest:\n{}".format(train,test))
train:
   user  item
0     1     1
1     1     2
2     1     3
5     2     1
6     2     2
7     3     3

test:
   user  item
3     1     2
4     1     2
{% endraw %}

Last-session-out split

{% raw %}

last_session_out_split[source]

last_session_out_split(data, user_key='user_id', session_key='session_id', time_key='ts')

Assign the last session of every user to the test set and all of that user's remaining sessions to the training set.

{% endraw %} {% raw %}
{% endraw %} {% raw %}
import pandas as pd

df = pd.DataFrame.from_dict({
    'session_id': [357,359,394,4127,6400],
    'sequence': [[793, 3489],[1762],[1256],
                 [1948, 1364, 2060, 1115, 6488, 2060],
                 [687, 1394]],
    'ts': [1421003874, 1421018535, 1421007470,
           1421416896, 1420807778],
    'user_id': [4296, 4296, 30980, 28117, 35247]
})

df		
   session_id                              sequence          ts  user_id
0         357                           [793, 3489]  1421003874     4296
1         359                                [1762]  1421018535     4296
2         394                                [1256]  1421007470    30980
3        4127  [1948, 1364, 2060, 1115, 6488, 2060]  1421416896    28117
4        6400                           [687, 1394]  1420807778    35247
{% endraw %} {% raw %}
train_data, test_data = last_session_out_split(df)
train_data
   session_id     sequence          ts  user_id
0         357  [793, 3489]  1421003874     4296
{% endraw %}