diff --git a/.vscode/settings.json b/.vscode/settings.json index 14bd17e..4da6104 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,3 +1,3 @@ { - "python.pythonPath": "/root/anaconda3/envs/python377/bin/python" + "python.pythonPath": "/Users/sunlanchang/anaconda3/envs/dev/bin/python" } \ No newline at end of file diff --git a/process_data.ipynb b/process_data.ipynb index a08ed30..d1f89f2 100644 --- a/process_data.ipynb +++ b/process_data.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -19,7 +19,7 @@ }, { "cell_type": "code", - "execution_count": 157, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -30,7 +30,64 @@ }, { "cell_type": "code", - "execution_count": 158, + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# train_tmp_click_log = 'data/train_tmp/click_log.csv'\n", + "test_click_log_path = 'data/test/click_log.csv'\n", + "test_click_log = pd.read_csv(test_click_log_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": "((30082771, 4), (33585512, 4))" + }, + "metadata": {}, + "execution_count": 7 + } + ], + "source": [ + "train_click_log.shape,test_click_log.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "frame = [train_click_log, test_click_log]\n", + "click_log_train_and_test = pd.concat(frame, ignore_index=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": "(63668283, 4)" + }, + "metadata": {}, + "execution_count": 9 + } + ], + "source": [ + "click_log_train_and_test.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -40,7 +97,7 @@ "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
timeuser_idcreative_idclick_times
09309205673301
1653092030722551
2563092023613271
\n
" }, "metadata": {}, - "execution_count": 158 + "execution_count": 3 } ], "source": [ @@ -49,25 +106,26 @@ }, { "cell_type": "code", - "execution_count": 159, + "execution_count": 6, "metadata": {}, "outputs": [ { "output_type": "execute_result", "data": { - "text/plain": "time int64\nuser_id int64\ncreative_id int64\nclick_times int64\ndtype: object" + "text/plain": " time user_id creative_id click_times\n0 20 3131989 645764 1\n1 20 3131989 1027422 1\n2 20 3131989 1106443 1", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
timeuser_idcreative_idclick_times
02031319896457641
120313198910274221
220313198911064431
\n
" }, "metadata": {}, - "execution_count": 159 + "execution_count": 6 } ], "source": [ - "train_click_log.dtypes" + "test_click_log.head(3)" ] }, { "cell_type": "code", - "execution_count": 160, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -76,79 +134,215 @@ "text/plain": "((91,), (900000,), (2481135,), (41,))" }, "metadata": {}, - "execution_count": 160 + "execution_count": 5 } ], "source": [ "train_click_log.time.unique().shape, train_click_log.user_id.unique().shape, train_click_log.creative_id.unique().shape, train_click_log.click_times.unique().shape" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 处理ad" + ] + }, { "cell_type": "code", - "execution_count": 161, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ - "train_ad = pd.read_csv('data/train_preliminary/ad.csv')" + "train_ad = pd.read_csv('data/train_preliminary/ad.csv')\n", + "train_ad.product_id[train_ad.product_id=='\\\\N']='0'\n", + "train_ad.industry[train_ad.industry=='\\\\N']='0'\n", + "train_ad.product_id = train_ad.product_id.astype(int)\n", + "train_ad.industry = train_ad.industry.astype(int)" ] }, { "cell_type": "code", - "execution_count": 162, + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "test_ad = pd.read_csv('data/test/ad.csv')\n", + "test_ad.product_id[test_ad.product_id=='\\\\N']='0'\n", + "test_ad.industry[test_ad.industry=='\\\\N']='0'\n", + "test_ad.product_id = test_ad.product_id.astype(int)\n", + "test_ad.industry = test_ad.industry.astype(int)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "frame = [train_ad, test_ad]\n", + "ad_train_and_test = pd.concat(frame, ignore_index=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, "metadata": {}, "outputs": [ { "output_type": "execute_result", "data": { - "text/plain": " creative_id ad_id product_id product_category advertiser_id industry\n0 1 1 \\N 5 381 78\n1 4 4 \\N 5 108 202\n2 7 7 \\N 5 148 297\n3 8 8 \\N 5 713 213\n4 9 9 \\N 5 695 213", - "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
creative_idad_idproduct_idproduct_categoryadvertiser_idindustry
011\\N538178
144\\N5108202
277\\N5148297
388\\N5713213
499\\N5695213
\n
" + "text/plain": "(5099294, 6)" }, "metadata": {}, - "execution_count": 162 + "execution_count": 27 } ], "source": [ - "train_ad.head(5)" + "ad_train_and_test.shape" ] }, { "cell_type": "code", - "execution_count": 163, + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "ad_unique=ad_train_and_test.drop_duplicates(subset = None, \n", + " keep = 'first', inplace = False) " + ] + }, + { + "cell_type": "code", + "execution_count": 31, "metadata": {}, "outputs": [ { "output_type": "execute_result", "data": { - "text/plain": "creative_id int64\nad_id int64\nproduct_id object\nproduct_category int64\nadvertiser_id int64\nindustry object\ndtype: object" + "text/plain": "(3412772, 6)" }, "metadata": {}, - "execution_count": 163 + "execution_count": 31 } ], "source": [ - "train_ad.dtypes" + "ad_unique.shape" ] }, { "cell_type": "code", - "execution_count": 164, + "execution_count": 23, "metadata": {}, - "outputs": [], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": "(2481135, 6)" + }, + "metadata": {}, + "execution_count": 23 + } + ], "source": [ - "train_ad.product_id[train_ad.product_id=='\\\\N']='0'\n", - "train_ad.industry[train_ad.industry=='\\\\N']='0'" + "ad_unique.shape" ] }, { "cell_type": "code", - "execution_count": 165, + "execution_count": 17, "metadata": {}, - "outputs": [], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": "((2481135, 6), (2481135, 6))" + }, + "metadata": {}, + "execution_count": 17 + } + ], "source": [ - "train_ad.product_id = train_ad.product_id.astype(int)\n", - "train_ad.industry = train_ad.industry.astype(int)" + "train_ad.shape, test_ad.shape" ] }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": " creative_id ad_id product_id product_category advertiser_id industry\n0 1 1 0 5 381 78\n1 4 4 0 5 108 202", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
creative_idad_idproduct_idproduct_categoryadvertiser_idindustry
0110538178
14405108202
\n
" + }, + "metadata": {}, + "execution_count": 16 + } + ], + "source": [ + "test_ad.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": " creative_id ad_id product_id product_category advertiser_id industry\n0 1 1 0 5 381 78\n1 4 4 0 5 108 202", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
creative_idad_idproduct_idproduct_categoryadvertiser_idindustry
0110538178
14405108202
\n
" + }, + "metadata": {}, + "execution_count": 13 + } + ], + "source": [ + "train_ad.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": "(creative_id int64\n ad_id int64\n product_id int64\n product_category int64\n advertiser_id int64\n industry int64\n dtype: object, creative_id int64\n ad_id int64\n product_id int64\n product_category int64\n advertiser_id int64\n industry int64\n dtype: object)" + }, + "metadata": {}, + "execution_count": 18 + } + ], + "source": [ + "train_ad.dtypes,test_ad.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 164, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 165, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": 166, @@ -214,17 +408,17 @@ }, { "cell_type": "code", - "execution_count": 169, + "execution_count": 10, "metadata": {}, "outputs": [ { "output_type": "execute_result", "data": { - "text/plain": " time user_id creative_id click_times ad_id product_id \\\n0 9 30920 567330 1 504423 30673 \n1 65 30920 3072255 1 2642300 1261 \n2 56 30920 2361327 1 2035918 1261 \n3 6 309204 325532 1 292523 27081 \n4 59 309204 2746730 1 2362208 0 \n\n product_category advertiser_id industry \n0 3 32638 319 \n1 2 6783 6 \n2 2 6783 6 \n3 3 32066 242 \n4 18 14682 88 ", - "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
timeuser_idcreative_idclick_timesad_idproduct_idproduct_categoryadvertiser_idindustry
0930920567330150442330673332638319
165309203072255126423001261267836
256309202361327120359181261267836
36309204325532129252327081332066242
4593092042746730123622080181468288
\n
" + "text/plain": " time user_id creative_id click_times ad_id product_id \\\n0 9 30920 567330 1 504423 30673 \n1 65 30920 3072255 1 2642300 1261 \n2 56 30920 2361327 1 2035918 1261 \n3 6 309204 325532 1 292523 27081 \n4 59 309204 2746730 1 2362208 \\N \n\n product_category advertiser_id industry \n0 3 32638 319 \n1 2 6783 6 \n2 2 6783 6 \n3 3 32066 242 \n4 18 14682 88 ", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
timeuser_idcreative_idclick_timesad_idproduct_idproduct_categoryadvertiser_idindustry
0930920567330150442330673332638319
165309203072255126423001261267836
256309202361327120359181261267836
36309204325532129252327081332066242
459309204274673012362208\\N181468288
\n
" }, "metadata": {}, - "execution_count": 169 + "execution_count": 10 } ], "source": [ @@ -232,6 +426,24 @@ "train_clicklog_ad.head()" ] }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": "(30082771, 9)" + }, + "metadata": {}, + "execution_count": 11 + } + ], + "source": [ + "train_clicklog_ad.shape" + ] + }, { "cell_type": "code", "execution_count": 170, @@ -298,11 +510,7 @@ "execution_count": 144, "metadata": {}, "outputs": [], - "source": [ - "# train_tmp_click_log = 'data/train_tmp/click_log.csv'\n", - "test_click_log_path = 'data/test/click_log.csv'\n", - "test_click_log = pd.read_csv(test_click_log_path)" - ] + "source": [] }, { "cell_type": "code", @@ -342,21 +550,14 @@ "execution_count": 147, "metadata": {}, "outputs": [], - "source": [ - "test_ad = pd.read_csv('data/train_preliminary/ad.csv')" - ] + "source": [] }, { "cell_type": "code", "execution_count": 148, "metadata": {}, "outputs": [], - "source": [ - "test_ad.product_id[test_ad.product_id=='\\\\N']='0'\n", - "test_ad.industry[test_ad.industry=='\\\\N']='0'\n", - "test_ad.product_id = test_ad.product_id.astype(int)\n", - "test_ad.industry = test_ad.industry.astype(int)" - ] + "source": [] }, { "cell_type": "code", diff --git a/process_data_old.ipynb b/process_data_old.ipynb new file mode 100644 index 0000000..1eb61ab --- /dev/null +++ b/process_data_old.ipynb @@ -0,0 +1,626 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# train data process" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# train_tmp_click_log = 'data/train_tmp/click_log.csv'\n", + "train_click_log_path = 'data/train_preliminary/click_log.csv'\n", + "train_click_log = pd.read_csv(train_click_log_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": " time user_id creative_id click_times\n0 9 30920 567330 1\n1 65 30920 3072255 1\n2 56 30920 2361327 1", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
timeuser_idcreative_idclick_times
09309205673301
1653092030722551
2563092023613271
\n
" + }, + "metadata": {}, + "execution_count": 3 + } + ], + "source": [ + "train_click_log.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": "time int64\nuser_id int64\ncreative_id int64\nclick_times int64\ndtype: object" + }, + "metadata": {}, + "execution_count": 4 + } + ], + "source": [ + "train_click_log.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": "((91,), (900000,), (2481135,), (41,))" + }, + "metadata": {}, + "execution_count": 5 + } + ], + "source": [ + "train_click_log.time.unique().shape, train_click_log.user_id.unique().shape, train_click_log.creative_id.unique().shape, train_click_log.click_times.unique().shape" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": "(30082771, 4)" + }, + "metadata": {}, + "execution_count": 6 + } + ], + "source": [ + "train_click_log.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "train_ad = pd.read_csv('data/train_preliminary/ad.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": " creative_id ad_id product_id product_category advertiser_id industry\n0 1 1 \\N 5 381 78\n1 4 4 \\N 5 108 202\n2 7 7 \\N 5 148 297\n3 8 8 \\N 5 713 213\n4 9 9 \\N 5 695 213", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
creative_idad_idproduct_idproduct_categoryadvertiser_idindustry
011\\N538178
144\\N5108202
277\\N5148297
388\\N5713213
499\\N5695213
\n
" + }, + "metadata": {}, + "execution_count": 8 + } + ], + "source": [ + "train_ad.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": "(2481135, 6)" + }, + "metadata": {}, + "execution_count": 9 + } + ], + "source": [ + "train_ad.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 163, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": "creative_id int64\nad_id int64\nproduct_id object\nproduct_category int64\nadvertiser_id int64\nindustry object\ndtype: object" + }, + "metadata": {}, + "execution_count": 163 + } + ], + "source": [ + "train_ad.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 164, + "metadata": {}, + "outputs": [], + "source": [ + "train_ad.product_id[train_ad.product_id=='\\\\N']='0'\n", + "train_ad.industry[train_ad.industry=='\\\\N']='0'" + ] + }, + { + "cell_type": "code", + "execution_count": 165, + "metadata": {}, + "outputs": [], + "source": [ + "train_ad.product_id = train_ad.product_id.astype(int)\n", + "train_ad.industry = train_ad.industry.astype(int)" + ] + }, + { + "cell_type": "code", + "execution_count": 166, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": "creative_id int64\nad_id int64\nproduct_id int64\nproduct_category int64\nadvertiser_id int64\nindustry int64\ndtype: object" + }, + "metadata": {}, + "execution_count": 166 + } + ], + "source": [ + "train_ad.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 167, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": " user_id age gender\n0 1 4 1\n1 2 10 1", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
user_idagegender
0141
12101
\n
" + }, + "metadata": {}, + "execution_count": 167 + } + ], + "source": [ + "train_user = pd.read_csv('data/train_preliminary/user.csv')\n", + "train_user.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 168, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": "user_id int64\nage int64\ngender int64\ndtype: object" + }, + "metadata": {}, + "execution_count": 168 + } + ], + "source": [ + "train_user.dtypes" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## merge user.csv ad.csv to click_log.csv" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": " time user_id creative_id click_times ad_id product_id \\\n0 9 30920 567330 1 504423 30673 \n1 65 30920 3072255 1 2642300 1261 \n2 56 30920 2361327 1 2035918 1261 \n3 6 309204 325532 1 292523 27081 \n4 59 309204 2746730 1 2362208 \\N \n\n product_category advertiser_id industry \n0 3 32638 319 \n1 2 6783 6 \n2 2 6783 6 \n3 3 32066 242 \n4 18 14682 88 ", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
timeuser_idcreative_idclick_timesad_idproduct_idproduct_categoryadvertiser_idindustry
0930920567330150442330673332638319
165309203072255126423001261267836
256309202361327120359181261267836
36309204325532129252327081332066242
459309204274673012362208\\N181468288
\n
" + }, + "metadata": {}, + "execution_count": 10 + } + ], + "source": [ + "train_clicklog_ad = pd.merge(train_click_log, train_ad, on='creative_id', how='left')\n", + "train_clicklog_ad.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": "(30082771, 9)" + }, + "metadata": {}, + "execution_count": 11 + } + ], + "source": [ + "train_clicklog_ad.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 170, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": " time user_id creative_id click_times ad_id product_id \\\n0 9 30920 567330 1 504423 30673 \n1 65 30920 3072255 1 2642300 1261 \n2 56 30920 2361327 1 2035918 1261 \n3 6 309204 325532 1 292523 27081 \n4 59 309204 2746730 1 2362208 0 \n\n product_category advertiser_id industry age gender \n0 3 32638 319 2 1 \n1 2 6783 6 2 1 \n2 2 6783 6 2 1 \n3 3 32066 242 6 1 \n4 18 14682 88 6 1 ", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
timeuser_idcreative_idclick_timesad_idproduct_idproduct_categoryadvertiser_idindustryagegender
093092056733015044233067333263831921
16530920307225512642300126126783621
25630920236132712035918126126783621
3630920432553212925232708133206624261
459309204274673012362208018146828861
\n
" + }, + "metadata": {}, + "execution_count": 170 + } + ], + "source": [ + "train_clicklog_ad_user = pd.merge(train_clicklog_ad, train_user, on='user_id', how='left')\n", + "train_clicklog_ad_user.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 171, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": "time int64\nuser_id int64\ncreative_id int64\nclick_times int64\nad_id int64\nproduct_id int64\nproduct_category int64\nadvertiser_id int64\nindustry int64\nage int64\ngender int64\ndtype: object" + }, + "metadata": {}, + "execution_count": 171 + } + ], + "source": [ + "train_clicklog_ad_user.dtypes" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## write to csv file" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "# train_clicklog_ad_user.to_csv('data/train_preliminary/clicklog_ad_user.csv',index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Test data process" + ] + }, + { + "cell_type": "code", + "execution_count": 144, + "metadata": {}, + "outputs": [], + "source": [ + "# train_tmp_click_log = 'data/train_tmp/click_log.csv'\n", + "test_click_log_path = 'data/test/click_log.csv'\n", + "test_click_log = pd.read_csv(test_click_log_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 145, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": "((91,), (1000000,), (2618159,), (93,))" + }, + "metadata": {}, + "execution_count": 145 + } + ], + "source": [ + "test_click_log.time.unique().shape, test_click_log.user_id.unique().shape, test_click_log.creative_id.unique().shape, test_click_log.click_times.unique().shape" + ] + }, + { + "cell_type": "code", + "execution_count": 146, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "\nRangeIndex: 33585512 entries, 0 to 33585511\nData columns (total 4 columns):\ntime int64\nuser_id int64\ncreative_id int64\nclick_times int64\ndtypes: int64(4)\nmemory usage: 1.0 GB\n" + } + ], + "source": [ + "test_click_log.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 147, + "metadata": {}, + "outputs": [], + "source": [ + "test_ad = pd.read_csv('data/train_preliminary/ad.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 148, + "metadata": {}, + "outputs": [], + "source": [ + "test_ad.product_id[test_ad.product_id=='\\\\N']='0'\n", + "test_ad.industry[test_ad.industry=='\\\\N']='0'\n", + "test_ad.product_id = test_ad.product_id.astype(int)\n", + "test_ad.industry = test_ad.industry.astype(int)" + ] + }, + { + "cell_type": "code", + "execution_count": 149, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": "creative_id int64\nad_id int64\nproduct_id int64\nproduct_category int64\nadvertiser_id int64\nindustry int64\ndtype: object" + }, + "metadata": {}, + "execution_count": 149 + } + ], + "source": [ + "test_ad.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 150, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": " creative_id ad_id product_id product_category advertiser_id industry\n0 1 1 0 5 381 78\n1 4 4 0 5 108 202\n2 7 7 0 5 148 297\n3 8 8 0 5 713 213\n4 9 9 0 5 695 213", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
creative_idad_idproduct_idproduct_categoryadvertiser_idindustry
0110538178
14405108202
27705148297
38805713213
49905695213
\n
" + }, + "metadata": {}, + "execution_count": 150 + } + ], + "source": [ + "test_ad.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 在click_log.csv中有20多万的creative_id没有在ad.csv出现,使用inner方式保留两个表的公共creative_id的行" + ] + }, + { + "cell_type": "code", + "execution_count": 151, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": " time user_id creative_id click_times ad_id product_id \\\n0 20 3131989 645764 1 573314 58 \n1 10 3142948 645764 1 573314 58 \n2 14 3170643 645764 1 573314 58 \n3 10 3194257 645764 1 573314 58 \n4 21 3222925 645764 1 573314 58 \n\n product_category advertiser_id industry \n0 2 14689 6 \n1 2 14689 6 \n2 2 14689 6 \n3 2 14689 6 \n4 2 14689 6 ", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
timeuser_idcreative_idclick_timesad_idproduct_idproduct_categoryadvertiser_idindustry
02031319896457641573314582146896
11031429486457641573314582146896
21431706436457641573314582146896
31031942576457641573314582146896
42132229256457641573314582146896
\n
" + }, + "metadata": {}, + "execution_count": 151 + } + ], + "source": [ + "# test_clicklog_ad = pd.merge(test_click_log, test_ad, on='creative_id', how='left')\n", + "# test_click_ad_dropna = test_clicklog_ad.dropna()\n", + "test_clicklog_ad = pd.merge(test_click_log, test_ad, on='creative_id', how='inner')\n", + "test_clicklog_ad.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 152, + "metadata": {}, + "outputs": [], + "source": [ + "# 类型转换为int\n", + "# test_clicklog_ad_drop_userid.ad_id = test_clicklog_ad_drop_userid.product_id.astype(int)\n", + "# test_clicklog_ad_drop_userid.product_id = test_clicklog_ad_drop_userid.product_id.astype(int)\n", + "# test_clicklog_ad_drop_userid.product_category = test_clicklog_ad_drop_userid.product_id.astype(int)\n", + "# test_clicklog_ad_drop_userid.advertiser_id = test_clicklog_ad_drop_userid.product_id.astype(int)\n", + "# test_clicklog_ad_drop_userid.industry = test_clicklog_ad_drop_userid.product_id.astype(int)" + ] + }, + { + "cell_type": "code", + "execution_count": 153, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "\nInt64Index: 32310439 entries, 0 to 32310438\nData columns (total 9 columns):\ntime int64\nuser_id int64\ncreative_id int64\nclick_times int64\nad_id int64\nproduct_id int64\nproduct_category int64\nadvertiser_id int64\nindustry int64\ndtypes: int64(9)\nmemory usage: 2.4 GB\n" + } + ], + "source": [ + "test_clicklog_ad.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 154, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "\nInt64Index: 32310439 entries, 0 to 32310438\nData columns (total 9 columns):\ntime int64\nuser_id int64\ncreative_id int64\nclick_times int64\nad_id int64\nproduct_id int64\nproduct_category int64\nadvertiser_id int64\nindustry int64\ndtypes: int64(9)\nmemory usage: 2.4 GB\n" + } + ], + "source": [ + "test_clicklog_ad.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 155, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "test_clicklog_ad.to_csv('data/test/clicklog_ad.csv', index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 141, + "metadata": {}, + "outputs": [], + "source": [ + "# 删除重复user_id\n", + "# test_clicklog_ad_drop_userid = test_click_ad_dropna.drop_duplicates('user_id')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# test_clicklog_ad_drop_userid.info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# test_clicklog_ad_drop_userid.to_csv('data/test/clicklog_ad.csv', index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 172, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "\nInt64Index: 30082771 entries, 0 to 30082770\nData columns (total 11 columns):\ntime int64\nuser_id int64\ncreative_id int64\nclick_times int64\nad_id int64\nproduct_id int64\nproduct_category int64\nadvertiser_id int64\nindustry int64\nage int64\ngender int64\ndtypes: int64(11)\nmemory usage: 2.7 GB\n" + } + ], + "source": [ + "train_clicklog_ad_user.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 173, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": " time user_id creative_id click_times ad_id product_id \\\n0 9 30920 567330 1 504423 30673 \n1 65 30920 3072255 1 2642300 1261 \n2 56 30920 2361327 1 2035918 1261 \n3 6 309204 325532 1 292523 27081 \n4 59 309204 2746730 1 2362208 0 \n\n product_category advertiser_id industry age gender \n0 3 32638 319 2 1 \n1 2 6783 6 2 1 \n2 2 6783 6 2 1 \n3 3 32066 242 6 1 \n4 18 14682 88 6 1 ", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
timeuser_idcreative_idclick_timesad_idproduct_idproduct_categoryadvertiser_idindustryagegender
093092056733015044233067333263831921
16530920307225512642300126126783621
25630920236132712035918126126783621
3630920432553212925232708133206624261
459309204274673012362208018146828861
\n
" + }, + "metadata": {}, + "execution_count": 173 + } + ], + "source": [ + "train_clicklog_ad_user.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5-final" + }, + "orig_nbformat": 2, + "kernelspec": { + "name": "python36564bit792083a9d155497086f5b8bc917c01d5", + "display_name": "Python 3.6.5 64-bit" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/word2vec_creative_id.py b/word2vec_creative_id.py index e9acc22..e5e8450 100644 --- a/word2vec_creative_id.py +++ b/word2vec_creative_id.py @@ -9,7 +9,7 @@ from gensim.models import KeyedVectors from gensim.test.utils import common_texts, get_tmpfile import pickle -from mail import mail +from mymail import mail # %% df_train = pd.read_csv( 'data/train_preliminary/clicklog_ad_user_train_eval_test.csv')