Skip to content

Commit

Permalink
update code
Browse files Browse the repository at this point in the history
  • Loading branch information
sunlanchang committed May 8, 2020
1 parent faf3f3a commit 6324858
Showing 1 changed file with 181 additions and 23 deletions.
204 changes: 181 additions & 23 deletions process_data.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -205,24 +205,6 @@
"train_user.dtypes"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "(2481135,)"
},
"metadata": {},
"execution_count": 15
}
],
"source": [
"train_ad.creative_id.unique().shape"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down Expand Up @@ -313,13 +295,189 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"train_tmp_click_log = 'data/train_tmp/click_log.csv'\n",
"# train_click_log_path = 'data/train_preliminary/click_log.csv'\n",
"train_click_log = pd.read_csv(train_click_log_path)"
"# Read the test-split click log; the path stays in its own variable for reuse.\n",
"test_click_log_path = 'data/test/click_log.csv'\n",
"test_click_log = pd.read_csv(filepath_or_buffer=test_click_log_path)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "((91,), (1000000,), (2618159,), (93,))"
},
"metadata": {},
"execution_count": 15
}
],
"source": [
"tuple(test_click_log[col].unique().shape for col in ('time', 'user_id', 'creative_id', 'click_times'))"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "time int64\nuser_id int64\ncreative_id int64\nclick_times int64\ndtype: object"
},
"metadata": {},
"execution_count": 16
}
],
"source": [
"test_click_log.dtypes"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"# Read the ad table for the test split (this cell previously loaded train_preliminary/ad.csv).\n",
"# NOTE(review): path assumed to sit beside data/test/click_log.csv - confirm the file exists.\n",
"test_ad = pd.read_csv('data/test/ad.csv')"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "creative_id int64\nad_id int64\nproduct_id object\nproduct_category int64\nadvertiser_id int64\nindustry object\ndtype: object"
},
"metadata": {},
"execution_count": 18
}
],
"source": [
"test_ad.dtypes"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"# Overwrite the literal \\N entries with '0' through one .loc assignment per column;\n",
"# chained indexing (df.col[mask] = v) may write to a temporary copy and raises SettingWithCopyWarning.\n",
"test_ad.loc[test_ad.product_id == '\\\\N', 'product_id'] = '0'\n",
"test_ad.loc[test_ad.industry == '\\\\N', 'industry'] = '0'"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"# Cast the two columns to int.\n",
"test_ad['product_id'] = test_ad['product_id'].astype(int)\n",
"test_ad['industry'] = test_ad['industry'].astype(int)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "creative_id int64\nad_id int64\nproduct_id int64\nproduct_category int64\nadvertiser_id int64\nindustry int64\ndtype: object"
},
"metadata": {},
"execution_count": 21
}
],
"source": [
"test_ad.dtypes"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": " creative_id ad_id product_id product_category advertiser_id industry\n0 1 1 0 5 381 78\n1 4 4 0 5 108 202\n2 7 7 0 5 148 297\n3 8 8 0 5 713 213\n4 9 9 0 5 695 213",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>creative_id</th>\n <th>ad_id</th>\n <th>product_id</th>\n <th>product_category</th>\n <th>advertiser_id</th>\n <th>industry</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>1</td>\n <td>1</td>\n <td>0</td>\n <td>5</td>\n <td>381</td>\n <td>78</td>\n </tr>\n <tr>\n <th>1</th>\n <td>4</td>\n <td>4</td>\n <td>0</td>\n <td>5</td>\n <td>108</td>\n <td>202</td>\n </tr>\n <tr>\n <th>2</th>\n <td>7</td>\n <td>7</td>\n <td>0</td>\n <td>5</td>\n <td>148</td>\n <td>297</td>\n </tr>\n <tr>\n <th>3</th>\n <td>8</td>\n <td>8</td>\n <td>0</td>\n <td>5</td>\n <td>713</td>\n <td>213</td>\n </tr>\n <tr>\n <th>4</th>\n <td>9</td>\n <td>9</td>\n <td>0</td>\n <td>5</td>\n <td>695</td>\n <td>213</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"metadata": {},
"execution_count": 23
}
],
"source": [
"test_ad.head()"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": " time user_id creative_id click_times ad_id product_id \\\n0 20 3131989 645764 1 573314.0 58.0 \n1 20 3131989 1027422 1 902764.0 129.0 \n2 20 3131989 1106443 1 970829.0 2171.0 \n3 20 3131989 629802 1 559183.0 0.0 \n4 59 3131989 2839769 1 2441288.0 129.0 \n\n product_category advertiser_id industry \n0 2.0 14689.0 6.0 \n1 2.0 42272.0 6.0 \n2 2.0 37513.0 322.0 \n3 18.0 14678.0 26.0 \n4 2.0 35328.0 6.0 ",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>time</th>\n <th>user_id</th>\n <th>creative_id</th>\n <th>click_times</th>\n <th>ad_id</th>\n <th>product_id</th>\n <th>product_category</th>\n <th>advertiser_id</th>\n <th>industry</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>20</td>\n <td>3131989</td>\n <td>645764</td>\n <td>1</td>\n <td>573314.0</td>\n <td>58.0</td>\n <td>2.0</td>\n <td>14689.0</td>\n <td>6.0</td>\n </tr>\n <tr>\n <th>1</th>\n <td>20</td>\n <td>3131989</td>\n <td>1027422</td>\n <td>1</td>\n <td>902764.0</td>\n <td>129.0</td>\n <td>2.0</td>\n <td>42272.0</td>\n <td>6.0</td>\n </tr>\n <tr>\n <th>2</th>\n <td>20</td>\n <td>3131989</td>\n <td>1106443</td>\n <td>1</td>\n <td>970829.0</td>\n <td>2171.0</td>\n <td>2.0</td>\n <td>37513.0</td>\n <td>322.0</td>\n </tr>\n <tr>\n <th>3</th>\n <td>20</td>\n <td>3131989</td>\n <td>629802</td>\n <td>1</td>\n <td>559183.0</td>\n <td>0.0</td>\n <td>18.0</td>\n <td>14678.0</td>\n <td>26.0</td>\n </tr>\n <tr>\n <th>4</th>\n <td>59</td>\n <td>3131989</td>\n <td>2839769</td>\n <td>1</td>\n <td>2441288.0</td>\n <td>129.0</td>\n <td>2.0</td>\n <td>35328.0</td>\n <td>6.0</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"metadata": {},
"execution_count": 22
}
],
"source": [
"# Left-join the ad columns onto every test click row by creative_id, then preview the result.\n",
"test_clicklog_ad = test_click_log.merge(test_ad, how='left', on='creative_id')\n",
"test_clicklog_ad.head()"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "(2618159,)"
},
"metadata": {},
"execution_count": 30
}
],
"source": [
"test_click_log['creative_id'].unique().shape"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "(2481135,)"
},
"metadata": {},
"execution_count": 31
}
],
"source": [
"test_ad['creative_id'].unique().shape"
]
}
],
Expand Down

0 comments on commit 6324858

Please sign in to comment.