diff --git a/.vscode/settings.json b/.vscode/settings.json
index 14bd17e..4da6104 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -1,3 +1,3 @@
{
- "python.pythonPath": "/root/anaconda3/envs/python377/bin/python"
+ "python.pythonPath": "/Users/sunlanchang/anaconda3/envs/dev/bin/python"
}
\ No newline at end of file
diff --git a/process_data.ipynb b/process_data.ipynb
index a08ed30..d1f89f2 100644
--- a/process_data.ipynb
+++ b/process_data.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
@@ -19,7 +19,7 @@
},
{
"cell_type": "code",
- "execution_count": 157,
+ "execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@@ -30,7 +30,64 @@
},
{
"cell_type": "code",
- "execution_count": 158,
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# train_tmp_click_log = 'data/train_tmp/click_log.csv'\n",
+ "test_click_log_path = 'data/test/click_log.csv'\n",
+ "test_click_log = pd.read_csv(test_click_log_path)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": "((30082771, 4), (33585512, 4))"
+ },
+ "metadata": {},
+ "execution_count": 7
+ }
+ ],
+ "source": [
+ "train_click_log.shape,test_click_log.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "frame = [train_click_log, test_click_log]\n",
+ "click_log_train_and_test = pd.concat(frame, ignore_index=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": "(63668283, 4)"
+ },
+ "metadata": {},
+ "execution_count": 9
+ }
+ ],
+ "source": [
+ "click_log_train_and_test.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
"metadata": {},
"outputs": [
{
@@ -40,7 +97,7 @@
"text/html": "
\n\n
\n \n \n | \n time | \n user_id | \n creative_id | \n click_times | \n
\n \n \n \n 0 | \n 9 | \n 30920 | \n 567330 | \n 1 | \n
\n \n 1 | \n 65 | \n 30920 | \n 3072255 | \n 1 | \n
\n \n 2 | \n 56 | \n 30920 | \n 2361327 | \n 1 | \n
\n \n
\n
"
},
"metadata": {},
- "execution_count": 158
+ "execution_count": 3
}
],
"source": [
@@ -49,25 +106,26 @@
},
{
"cell_type": "code",
- "execution_count": 159,
+ "execution_count": 6,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
- "text/plain": "time int64\nuser_id int64\ncreative_id int64\nclick_times int64\ndtype: object"
+ "text/plain": " time user_id creative_id click_times\n0 20 3131989 645764 1\n1 20 3131989 1027422 1\n2 20 3131989 1106443 1",
+ "text/html": "\n\n
\n \n \n | \n time | \n user_id | \n creative_id | \n click_times | \n
\n \n \n \n 0 | \n 20 | \n 3131989 | \n 645764 | \n 1 | \n
\n \n 1 | \n 20 | \n 3131989 | \n 1027422 | \n 1 | \n
\n \n 2 | \n 20 | \n 3131989 | \n 1106443 | \n 1 | \n
\n \n
\n
"
},
"metadata": {},
- "execution_count": 159
+ "execution_count": 6
}
],
"source": [
- "train_click_log.dtypes"
+ "test_click_log.head(3)"
]
},
{
"cell_type": "code",
- "execution_count": 160,
+ "execution_count": 5,
"metadata": {},
"outputs": [
{
@@ -76,79 +134,215 @@
"text/plain": "((91,), (900000,), (2481135,), (41,))"
},
"metadata": {},
- "execution_count": 160
+ "execution_count": 5
}
],
"source": [
"train_click_log.time.unique().shape, train_click_log.user_id.unique().shape, train_click_log.creative_id.unique().shape, train_click_log.click_times.unique().shape"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 处理ad"
+ ]
+ },
{
"cell_type": "code",
- "execution_count": 161,
+ "execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
- "train_ad = pd.read_csv('data/train_preliminary/ad.csv')"
+ "train_ad = pd.read_csv('data/train_preliminary/ad.csv')\n",
+ "train_ad.loc[train_ad.product_id == '\\\\N', 'product_id'] = '0'  # .loc: chained assignment may not write back to the frame\n",
+ "train_ad.loc[train_ad.industry == '\\\\N', 'industry'] = '0'\n",
+ "train_ad.product_id = train_ad.product_id.astype(int)\n",
+ "train_ad.industry = train_ad.industry.astype(int)"
]
},
{
"cell_type": "code",
- "execution_count": 162,
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "test_ad = pd.read_csv('data/test/ad.csv')\n",
+ "test_ad.loc[test_ad.product_id == '\\\\N', 'product_id'] = '0'  # .loc: chained assignment may not write back to the frame\n",
+ "test_ad.loc[test_ad.industry == '\\\\N', 'industry'] = '0'\n",
+ "test_ad.product_id = test_ad.product_id.astype(int)\n",
+ "test_ad.industry = test_ad.industry.astype(int)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "frame = [train_ad, test_ad]\n",
+ "ad_train_and_test = pd.concat(frame, ignore_index=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
- "text/plain": " creative_id ad_id product_id product_category advertiser_id industry\n0 1 1 \\N 5 381 78\n1 4 4 \\N 5 108 202\n2 7 7 \\N 5 148 297\n3 8 8 \\N 5 713 213\n4 9 9 \\N 5 695 213",
- "text/html": "\n\n
\n \n \n | \n creative_id | \n ad_id | \n product_id | \n product_category | \n advertiser_id | \n industry | \n
\n \n \n \n 0 | \n 1 | \n 1 | \n \\N | \n 5 | \n 381 | \n 78 | \n
\n \n 1 | \n 4 | \n 4 | \n \\N | \n 5 | \n 108 | \n 202 | \n
\n \n 2 | \n 7 | \n 7 | \n \\N | \n 5 | \n 148 | \n 297 | \n
\n \n 3 | \n 8 | \n 8 | \n \\N | \n 5 | \n 713 | \n 213 | \n
\n \n 4 | \n 9 | \n 9 | \n \\N | \n 5 | \n 695 | \n 213 | \n
\n \n
\n
"
+ "text/plain": "(5099294, 6)"
},
"metadata": {},
- "execution_count": 162
+ "execution_count": 27
}
],
"source": [
- "train_ad.head(5)"
+ "ad_train_and_test.shape"
]
},
{
"cell_type": "code",
- "execution_count": 163,
+ "execution_count": 30,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ad_unique=ad_train_and_test.drop_duplicates(subset = None, \n",
+ " keep = 'first', inplace = False) "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
- "text/plain": "creative_id int64\nad_id int64\nproduct_id object\nproduct_category int64\nadvertiser_id int64\nindustry object\ndtype: object"
+ "text/plain": "(3412772, 6)"
},
"metadata": {},
- "execution_count": 163
+ "execution_count": 31
}
],
"source": [
- "train_ad.dtypes"
+ "ad_unique.shape"
]
},
{
"cell_type": "code",
- "execution_count": 164,
+ "execution_count": 23,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": "(2481135, 6)"
+ },
+ "metadata": {},
+ "execution_count": 23
+ }
+ ],
"source": [
- "train_ad.product_id[train_ad.product_id=='\\\\N']='0'\n",
- "train_ad.industry[train_ad.industry=='\\\\N']='0'"
+ "ad_unique.shape"
]
},
{
"cell_type": "code",
- "execution_count": 165,
+ "execution_count": 17,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": "((2481135, 6), (2481135, 6))"
+ },
+ "metadata": {},
+ "execution_count": 17
+ }
+ ],
"source": [
- "train_ad.product_id = train_ad.product_id.astype(int)\n",
- "train_ad.industry = train_ad.industry.astype(int)"
+ "train_ad.shape, test_ad.shape"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": " creative_id ad_id product_id product_category advertiser_id industry\n0 1 1 0 5 381 78\n1 4 4 0 5 108 202",
+ "text/html": "\n\n
\n \n \n | \n creative_id | \n ad_id | \n product_id | \n product_category | \n advertiser_id | \n industry | \n
\n \n \n \n 0 | \n 1 | \n 1 | \n 0 | \n 5 | \n 381 | \n 78 | \n
\n \n 1 | \n 4 | \n 4 | \n 0 | \n 5 | \n 108 | \n 202 | \n
\n \n
\n
"
+ },
+ "metadata": {},
+ "execution_count": 16
+ }
+ ],
+ "source": [
+ "test_ad.head(2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": " creative_id ad_id product_id product_category advertiser_id industry\n0 1 1 0 5 381 78\n1 4 4 0 5 108 202",
+ "text/html": "\n\n
\n \n \n | \n creative_id | \n ad_id | \n product_id | \n product_category | \n advertiser_id | \n industry | \n
\n \n \n \n 0 | \n 1 | \n 1 | \n 0 | \n 5 | \n 381 | \n 78 | \n
\n \n 1 | \n 4 | \n 4 | \n 0 | \n 5 | \n 108 | \n 202 | \n
\n \n
\n
"
+ },
+ "metadata": {},
+ "execution_count": 13
+ }
+ ],
+ "source": [
+ "train_ad.head(2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": "(creative_id int64\n ad_id int64\n product_id int64\n product_category int64\n advertiser_id int64\n industry int64\n dtype: object, creative_id int64\n ad_id int64\n product_id int64\n product_category int64\n advertiser_id int64\n industry int64\n dtype: object)"
+ },
+ "metadata": {},
+ "execution_count": 18
+ }
+ ],
+ "source": [
+ "train_ad.dtypes,test_ad.dtypes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 164,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 165,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
{
"cell_type": "code",
"execution_count": 166,
@@ -214,17 +408,17 @@
},
{
"cell_type": "code",
- "execution_count": 169,
+ "execution_count": 10,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
- "text/plain": " time user_id creative_id click_times ad_id product_id \\\n0 9 30920 567330 1 504423 30673 \n1 65 30920 3072255 1 2642300 1261 \n2 56 30920 2361327 1 2035918 1261 \n3 6 309204 325532 1 292523 27081 \n4 59 309204 2746730 1 2362208 0 \n\n product_category advertiser_id industry \n0 3 32638 319 \n1 2 6783 6 \n2 2 6783 6 \n3 3 32066 242 \n4 18 14682 88 ",
- "text/html": "\n\n
\n \n \n | \n time | \n user_id | \n creative_id | \n click_times | \n ad_id | \n product_id | \n product_category | \n advertiser_id | \n industry | \n
\n \n \n \n 0 | \n 9 | \n 30920 | \n 567330 | \n 1 | \n 504423 | \n 30673 | \n 3 | \n 32638 | \n 319 | \n
\n \n 1 | \n 65 | \n 30920 | \n 3072255 | \n 1 | \n 2642300 | \n 1261 | \n 2 | \n 6783 | \n 6 | \n
\n \n 2 | \n 56 | \n 30920 | \n 2361327 | \n 1 | \n 2035918 | \n 1261 | \n 2 | \n 6783 | \n 6 | \n
\n \n 3 | \n 6 | \n 309204 | \n 325532 | \n 1 | \n 292523 | \n 27081 | \n 3 | \n 32066 | \n 242 | \n
\n \n 4 | \n 59 | \n 309204 | \n 2746730 | \n 1 | \n 2362208 | \n 0 | \n 18 | \n 14682 | \n 88 | \n
\n \n
\n
"
+ "text/plain": " time user_id creative_id click_times ad_id product_id \\\n0 9 30920 567330 1 504423 30673 \n1 65 30920 3072255 1 2642300 1261 \n2 56 30920 2361327 1 2035918 1261 \n3 6 309204 325532 1 292523 27081 \n4 59 309204 2746730 1 2362208 \\N \n\n product_category advertiser_id industry \n0 3 32638 319 \n1 2 6783 6 \n2 2 6783 6 \n3 3 32066 242 \n4 18 14682 88 ",
+ "text/html": "\n\n
\n \n \n | \n time | \n user_id | \n creative_id | \n click_times | \n ad_id | \n product_id | \n product_category | \n advertiser_id | \n industry | \n
\n \n \n \n 0 | \n 9 | \n 30920 | \n 567330 | \n 1 | \n 504423 | \n 30673 | \n 3 | \n 32638 | \n 319 | \n
\n \n 1 | \n 65 | \n 30920 | \n 3072255 | \n 1 | \n 2642300 | \n 1261 | \n 2 | \n 6783 | \n 6 | \n
\n \n 2 | \n 56 | \n 30920 | \n 2361327 | \n 1 | \n 2035918 | \n 1261 | \n 2 | \n 6783 | \n 6 | \n
\n \n 3 | \n 6 | \n 309204 | \n 325532 | \n 1 | \n 292523 | \n 27081 | \n 3 | \n 32066 | \n 242 | \n
\n \n 4 | \n 59 | \n 309204 | \n 2746730 | \n 1 | \n 2362208 | \n \\N | \n 18 | \n 14682 | \n 88 | \n
\n \n
\n
"
},
"metadata": {},
- "execution_count": 169
+ "execution_count": 10
}
],
"source": [
@@ -232,6 +426,24 @@
"train_clicklog_ad.head()"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": "(30082771, 9)"
+ },
+ "metadata": {},
+ "execution_count": 11
+ }
+ ],
+ "source": [
+ "train_clicklog_ad.shape"
+ ]
+ },
{
"cell_type": "code",
"execution_count": 170,
@@ -298,11 +510,7 @@
"execution_count": 144,
"metadata": {},
"outputs": [],
- "source": [
- "# train_tmp_click_log = 'data/train_tmp/click_log.csv'\n",
- "test_click_log_path = 'data/test/click_log.csv'\n",
- "test_click_log = pd.read_csv(test_click_log_path)"
- ]
+ "source": []
},
{
"cell_type": "code",
@@ -342,21 +550,14 @@
"execution_count": 147,
"metadata": {},
"outputs": [],
- "source": [
- "test_ad = pd.read_csv('data/train_preliminary/ad.csv')"
- ]
+ "source": []
},
{
"cell_type": "code",
"execution_count": 148,
"metadata": {},
"outputs": [],
- "source": [
- "test_ad.product_id[test_ad.product_id=='\\\\N']='0'\n",
- "test_ad.industry[test_ad.industry=='\\\\N']='0'\n",
- "test_ad.product_id = test_ad.product_id.astype(int)\n",
- "test_ad.industry = test_ad.industry.astype(int)"
- ]
+ "source": []
},
{
"cell_type": "code",
diff --git a/process_data_old.ipynb b/process_data_old.ipynb
new file mode 100644
index 0000000..1eb61ab
--- /dev/null
+++ b/process_data_old.ipynb
@@ -0,0 +1,626 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Train data processing"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# train_tmp_click_log = 'data/train_tmp/click_log.csv'\n",
+ "train_click_log_path = 'data/train_preliminary/click_log.csv'\n",
+ "train_click_log = pd.read_csv(train_click_log_path)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": " time user_id creative_id click_times\n0 9 30920 567330 1\n1 65 30920 3072255 1\n2 56 30920 2361327 1",
+ "text/html": "\n\n
\n \n \n | \n time | \n user_id | \n creative_id | \n click_times | \n
\n \n \n \n 0 | \n 9 | \n 30920 | \n 567330 | \n 1 | \n
\n \n 1 | \n 65 | \n 30920 | \n 3072255 | \n 1 | \n
\n \n 2 | \n 56 | \n 30920 | \n 2361327 | \n 1 | \n
\n \n
\n
"
+ },
+ "metadata": {},
+ "execution_count": 3
+ }
+ ],
+ "source": [
+ "train_click_log.head(3)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": "time int64\nuser_id int64\ncreative_id int64\nclick_times int64\ndtype: object"
+ },
+ "metadata": {},
+ "execution_count": 4
+ }
+ ],
+ "source": [
+ "train_click_log.dtypes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": "((91,), (900000,), (2481135,), (41,))"
+ },
+ "metadata": {},
+ "execution_count": 5
+ }
+ ],
+ "source": [
+ "train_click_log.time.unique().shape, train_click_log.user_id.unique().shape, train_click_log.creative_id.unique().shape, train_click_log.click_times.unique().shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": "(30082771, 4)"
+ },
+ "metadata": {},
+ "execution_count": 6
+ }
+ ],
+ "source": [
+ "train_click_log.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "train_ad = pd.read_csv('data/train_preliminary/ad.csv')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": " creative_id ad_id product_id product_category advertiser_id industry\n0 1 1 \\N 5 381 78\n1 4 4 \\N 5 108 202\n2 7 7 \\N 5 148 297\n3 8 8 \\N 5 713 213\n4 9 9 \\N 5 695 213",
+ "text/html": "\n\n
\n \n \n | \n creative_id | \n ad_id | \n product_id | \n product_category | \n advertiser_id | \n industry | \n
\n \n \n \n 0 | \n 1 | \n 1 | \n \\N | \n 5 | \n 381 | \n 78 | \n
\n \n 1 | \n 4 | \n 4 | \n \\N | \n 5 | \n 108 | \n 202 | \n
\n \n 2 | \n 7 | \n 7 | \n \\N | \n 5 | \n 148 | \n 297 | \n
\n \n 3 | \n 8 | \n 8 | \n \\N | \n 5 | \n 713 | \n 213 | \n
\n \n 4 | \n 9 | \n 9 | \n \\N | \n 5 | \n 695 | \n 213 | \n
\n \n
\n
"
+ },
+ "metadata": {},
+ "execution_count": 8
+ }
+ ],
+ "source": [
+ "train_ad.head(5)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": "(2481135, 6)"
+ },
+ "metadata": {},
+ "execution_count": 9
+ }
+ ],
+ "source": [
+ "train_ad.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 163,
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": "creative_id int64\nad_id int64\nproduct_id object\nproduct_category int64\nadvertiser_id int64\nindustry object\ndtype: object"
+ },
+ "metadata": {},
+ "execution_count": 163
+ }
+ ],
+ "source": [
+ "train_ad.dtypes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 164,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "train_ad.loc[train_ad.product_id == '\\\\N', 'product_id'] = '0'  # .loc: chained assignment may not write back to the frame\n",
+ "train_ad.loc[train_ad.industry == '\\\\N', 'industry'] = '0'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 165,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "train_ad.product_id = train_ad.product_id.astype(int)\n",
+ "train_ad.industry = train_ad.industry.astype(int)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 166,
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": "creative_id int64\nad_id int64\nproduct_id int64\nproduct_category int64\nadvertiser_id int64\nindustry int64\ndtype: object"
+ },
+ "metadata": {},
+ "execution_count": 166
+ }
+ ],
+ "source": [
+ "train_ad.dtypes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 167,
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": " user_id age gender\n0 1 4 1\n1 2 10 1",
+ "text/html": "\n\n
\n \n \n | \n user_id | \n age | \n gender | \n
\n \n \n \n 0 | \n 1 | \n 4 | \n 1 | \n
\n \n 1 | \n 2 | \n 10 | \n 1 | \n
\n \n
\n
"
+ },
+ "metadata": {},
+ "execution_count": 167
+ }
+ ],
+ "source": [
+ "train_user = pd.read_csv('data/train_preliminary/user.csv')\n",
+ "train_user.head(2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 168,
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": "user_id int64\nage int64\ngender int64\ndtype: object"
+ },
+ "metadata": {},
+ "execution_count": 168
+ }
+ ],
+ "source": [
+ "train_user.dtypes"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Merge user.csv and ad.csv into click_log.csv"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": " time user_id creative_id click_times ad_id product_id \\\n0 9 30920 567330 1 504423 30673 \n1 65 30920 3072255 1 2642300 1261 \n2 56 30920 2361327 1 2035918 1261 \n3 6 309204 325532 1 292523 27081 \n4 59 309204 2746730 1 2362208 \\N \n\n product_category advertiser_id industry \n0 3 32638 319 \n1 2 6783 6 \n2 2 6783 6 \n3 3 32066 242 \n4 18 14682 88 ",
+ "text/html": "\n\n
\n \n \n | \n time | \n user_id | \n creative_id | \n click_times | \n ad_id | \n product_id | \n product_category | \n advertiser_id | \n industry | \n
\n \n \n \n 0 | \n 9 | \n 30920 | \n 567330 | \n 1 | \n 504423 | \n 30673 | \n 3 | \n 32638 | \n 319 | \n
\n \n 1 | \n 65 | \n 30920 | \n 3072255 | \n 1 | \n 2642300 | \n 1261 | \n 2 | \n 6783 | \n 6 | \n
\n \n 2 | \n 56 | \n 30920 | \n 2361327 | \n 1 | \n 2035918 | \n 1261 | \n 2 | \n 6783 | \n 6 | \n
\n \n 3 | \n 6 | \n 309204 | \n 325532 | \n 1 | \n 292523 | \n 27081 | \n 3 | \n 32066 | \n 242 | \n
\n \n 4 | \n 59 | \n 309204 | \n 2746730 | \n 1 | \n 2362208 | \n \\N | \n 18 | \n 14682 | \n 88 | \n
\n \n
\n
"
+ },
+ "metadata": {},
+ "execution_count": 10
+ }
+ ],
+ "source": [
+ "train_clicklog_ad = pd.merge(train_click_log, train_ad, on='creative_id', how='left')\n",
+ "train_clicklog_ad.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": "(30082771, 9)"
+ },
+ "metadata": {},
+ "execution_count": 11
+ }
+ ],
+ "source": [
+ "train_clicklog_ad.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 170,
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": " time user_id creative_id click_times ad_id product_id \\\n0 9 30920 567330 1 504423 30673 \n1 65 30920 3072255 1 2642300 1261 \n2 56 30920 2361327 1 2035918 1261 \n3 6 309204 325532 1 292523 27081 \n4 59 309204 2746730 1 2362208 0 \n\n product_category advertiser_id industry age gender \n0 3 32638 319 2 1 \n1 2 6783 6 2 1 \n2 2 6783 6 2 1 \n3 3 32066 242 6 1 \n4 18 14682 88 6 1 ",
+ "text/html": "\n\n
\n \n \n | \n time | \n user_id | \n creative_id | \n click_times | \n ad_id | \n product_id | \n product_category | \n advertiser_id | \n industry | \n age | \n gender | \n
\n \n \n \n 0 | \n 9 | \n 30920 | \n 567330 | \n 1 | \n 504423 | \n 30673 | \n 3 | \n 32638 | \n 319 | \n 2 | \n 1 | \n
\n \n 1 | \n 65 | \n 30920 | \n 3072255 | \n 1 | \n 2642300 | \n 1261 | \n 2 | \n 6783 | \n 6 | \n 2 | \n 1 | \n
\n \n 2 | \n 56 | \n 30920 | \n 2361327 | \n 1 | \n 2035918 | \n 1261 | \n 2 | \n 6783 | \n 6 | \n 2 | \n 1 | \n
\n \n 3 | \n 6 | \n 309204 | \n 325532 | \n 1 | \n 292523 | \n 27081 | \n 3 | \n 32066 | \n 242 | \n 6 | \n 1 | \n
\n \n 4 | \n 59 | \n 309204 | \n 2746730 | \n 1 | \n 2362208 | \n 0 | \n 18 | \n 14682 | \n 88 | \n 6 | \n 1 | \n
\n \n
\n
"
+ },
+ "metadata": {},
+ "execution_count": 170
+ }
+ ],
+ "source": [
+ "train_clicklog_ad_user = pd.merge(train_clicklog_ad, train_user, on='user_id', how='left')\n",
+ "train_clicklog_ad_user.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 171,
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": "time int64\nuser_id int64\ncreative_id int64\nclick_times int64\nad_id int64\nproduct_id int64\nproduct_category int64\nadvertiser_id int64\nindustry int64\nage int64\ngender int64\ndtype: object"
+ },
+ "metadata": {},
+ "execution_count": 171
+ }
+ ],
+ "source": [
+ "train_clicklog_ad_user.dtypes"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## write to csv file"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# train_clicklog_ad_user.to_csv('data/train_preliminary/clicklog_ad_user.csv',index=False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Test data processing"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 144,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# train_tmp_click_log = 'data/train_tmp/click_log.csv'\n",
+ "test_click_log_path = 'data/test/click_log.csv'\n",
+ "test_click_log = pd.read_csv(test_click_log_path)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 145,
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": "((91,), (1000000,), (2618159,), (93,))"
+ },
+ "metadata": {},
+ "execution_count": 145
+ }
+ ],
+ "source": [
+ "test_click_log.time.unique().shape, test_click_log.user_id.unique().shape, test_click_log.creative_id.unique().shape, test_click_log.click_times.unique().shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 146,
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": "\nRangeIndex: 33585512 entries, 0 to 33585511\nData columns (total 4 columns):\ntime int64\nuser_id int64\ncreative_id int64\nclick_times int64\ndtypes: int64(4)\nmemory usage: 1.0 GB\n"
+ }
+ ],
+ "source": [
+ "test_click_log.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 147,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "test_ad = pd.read_csv('data/test/ad.csv')  # test-set ads; the train_preliminary file lacks many test creative_ids"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 148,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "test_ad.loc[test_ad.product_id == '\\\\N', 'product_id'] = '0'  # .loc: chained assignment may not write back to the frame\n",
+ "test_ad.loc[test_ad.industry == '\\\\N', 'industry'] = '0'\n",
+ "test_ad.product_id = test_ad.product_id.astype(int)\n",
+ "test_ad.industry = test_ad.industry.astype(int)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 149,
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": "creative_id int64\nad_id int64\nproduct_id int64\nproduct_category int64\nadvertiser_id int64\nindustry int64\ndtype: object"
+ },
+ "metadata": {},
+ "execution_count": 149
+ }
+ ],
+ "source": [
+ "test_ad.dtypes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 150,
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": " creative_id ad_id product_id product_category advertiser_id industry\n0 1 1 0 5 381 78\n1 4 4 0 5 108 202\n2 7 7 0 5 148 297\n3 8 8 0 5 713 213\n4 9 9 0 5 695 213",
+ "text/html": "\n\n
\n \n \n | \n creative_id | \n ad_id | \n product_id | \n product_category | \n advertiser_id | \n industry | \n
\n \n \n \n 0 | \n 1 | \n 1 | \n 0 | \n 5 | \n 381 | \n 78 | \n
\n \n 1 | \n 4 | \n 4 | \n 0 | \n 5 | \n 108 | \n 202 | \n
\n \n 2 | \n 7 | \n 7 | \n 0 | \n 5 | \n 148 | \n 297 | \n
\n \n 3 | \n 8 | \n 8 | \n 0 | \n 5 | \n 713 | \n 213 | \n
\n \n 4 | \n 9 | \n 9 | \n 0 | \n 5 | \n 695 | \n 213 | \n
\n \n
\n
"
+ },
+ "metadata": {},
+ "execution_count": 150
+ }
+ ],
+ "source": [
+ "test_ad.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 在click_log.csv中有20多万的creative_id没有在ad.csv出现,使用inner方式保留两个表的公共creative_id的行"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 151,
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": " time user_id creative_id click_times ad_id product_id \\\n0 20 3131989 645764 1 573314 58 \n1 10 3142948 645764 1 573314 58 \n2 14 3170643 645764 1 573314 58 \n3 10 3194257 645764 1 573314 58 \n4 21 3222925 645764 1 573314 58 \n\n product_category advertiser_id industry \n0 2 14689 6 \n1 2 14689 6 \n2 2 14689 6 \n3 2 14689 6 \n4 2 14689 6 ",
+ "text/html": "\n\n
\n \n \n | \n time | \n user_id | \n creative_id | \n click_times | \n ad_id | \n product_id | \n product_category | \n advertiser_id | \n industry | \n
\n \n \n \n 0 | \n 20 | \n 3131989 | \n 645764 | \n 1 | \n 573314 | \n 58 | \n 2 | \n 14689 | \n 6 | \n
\n \n 1 | \n 10 | \n 3142948 | \n 645764 | \n 1 | \n 573314 | \n 58 | \n 2 | \n 14689 | \n 6 | \n
\n \n 2 | \n 14 | \n 3170643 | \n 645764 | \n 1 | \n 573314 | \n 58 | \n 2 | \n 14689 | \n 6 | \n
\n \n 3 | \n 10 | \n 3194257 | \n 645764 | \n 1 | \n 573314 | \n 58 | \n 2 | \n 14689 | \n 6 | \n
\n \n 4 | \n 21 | \n 3222925 | \n 645764 | \n 1 | \n 573314 | \n 58 | \n 2 | \n 14689 | \n 6 | \n
\n \n
\n
"
+ },
+ "metadata": {},
+ "execution_count": 151
+ }
+ ],
+ "source": [
+ "# test_clicklog_ad = pd.merge(test_click_log, test_ad, on='creative_id', how='left')\n",
+ "# test_click_ad_dropna = test_clicklog_ad.dropna()\n",
+ "test_clicklog_ad = pd.merge(test_click_log, test_ad, on='creative_id', how='inner')\n",
+ "test_clicklog_ad.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 152,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# 类型转换为int\n",
+ "# test_clicklog_ad_drop_userid.ad_id = test_clicklog_ad_drop_userid.product_id.astype(int)\n",
+ "# test_clicklog_ad_drop_userid.product_id = test_clicklog_ad_drop_userid.product_id.astype(int)\n",
+ "# test_clicklog_ad_drop_userid.product_category = test_clicklog_ad_drop_userid.product_id.astype(int)\n",
+ "# test_clicklog_ad_drop_userid.advertiser_id = test_clicklog_ad_drop_userid.product_id.astype(int)\n",
+ "# test_clicklog_ad_drop_userid.industry = test_clicklog_ad_drop_userid.product_id.astype(int)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 153,
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": "\nInt64Index: 32310439 entries, 0 to 32310438\nData columns (total 9 columns):\ntime int64\nuser_id int64\ncreative_id int64\nclick_times int64\nad_id int64\nproduct_id int64\nproduct_category int64\nadvertiser_id int64\nindustry int64\ndtypes: int64(9)\nmemory usage: 2.4 GB\n"
+ }
+ ],
+ "source": [
+ "test_clicklog_ad.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 154,
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": "\nInt64Index: 32310439 entries, 0 to 32310438\nData columns (total 9 columns):\ntime int64\nuser_id int64\ncreative_id int64\nclick_times int64\nad_id int64\nproduct_id int64\nproduct_category int64\nadvertiser_id int64\nindustry int64\ndtypes: int64(9)\nmemory usage: 2.4 GB\n"
+ }
+ ],
+ "source": [
+ "test_clicklog_ad.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 155,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n",
+ "test_clicklog_ad.to_csv('data/test/clicklog_ad.csv', index=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 141,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# 删除重复user_id\n",
+ "# test_clicklog_ad_drop_userid = test_click_ad_dropna.drop_duplicates('user_id')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# test_clicklog_ad_drop_userid.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# test_clicklog_ad_drop_userid.to_csv('data/test/clicklog_ad.csv', index=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 172,
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": "\nInt64Index: 30082771 entries, 0 to 30082770\nData columns (total 11 columns):\ntime int64\nuser_id int64\ncreative_id int64\nclick_times int64\nad_id int64\nproduct_id int64\nproduct_category int64\nadvertiser_id int64\nindustry int64\nage int64\ngender int64\ndtypes: int64(11)\nmemory usage: 2.7 GB\n"
+ }
+ ],
+ "source": [
+ "train_clicklog_ad_user.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 173,
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": " time user_id creative_id click_times ad_id product_id \\\n0 9 30920 567330 1 504423 30673 \n1 65 30920 3072255 1 2642300 1261 \n2 56 30920 2361327 1 2035918 1261 \n3 6 309204 325532 1 292523 27081 \n4 59 309204 2746730 1 2362208 0 \n\n product_category advertiser_id industry age gender \n0 3 32638 319 2 1 \n1 2 6783 6 2 1 \n2 2 6783 6 2 1 \n3 3 32066 242 6 1 \n4 18 14682 88 6 1 ",
+ "text/html": "\n\n
\n \n \n | \n time | \n user_id | \n creative_id | \n click_times | \n ad_id | \n product_id | \n product_category | \n advertiser_id | \n industry | \n age | \n gender | \n
\n \n \n \n 0 | \n 9 | \n 30920 | \n 567330 | \n 1 | \n 504423 | \n 30673 | \n 3 | \n 32638 | \n 319 | \n 2 | \n 1 | \n
\n \n 1 | \n 65 | \n 30920 | \n 3072255 | \n 1 | \n 2642300 | \n 1261 | \n 2 | \n 6783 | \n 6 | \n 2 | \n 1 | \n
\n \n 2 | \n 56 | \n 30920 | \n 2361327 | \n 1 | \n 2035918 | \n 1261 | \n 2 | \n 6783 | \n 6 | \n 2 | \n 1 | \n
\n \n 3 | \n 6 | \n 309204 | \n 325532 | \n 1 | \n 292523 | \n 27081 | \n 3 | \n 32066 | \n 242 | \n 6 | \n 1 | \n
\n \n 4 | \n 59 | \n 309204 | \n 2746730 | \n 1 | \n 2362208 | \n 0 | \n 18 | \n 14682 | \n 88 | \n 6 | \n 1 | \n
\n \n
\n
"
+ },
+ "metadata": {},
+ "execution_count": 173
+ }
+ ],
+ "source": [
+ "train_clicklog_ad_user.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.5-final"
+ },
+ "orig_nbformat": 2,
+ "kernelspec": {
+ "name": "python36564bit792083a9d155497086f5b8bc917c01d5",
+ "display_name": "Python 3.6.5 64-bit"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
\ No newline at end of file
diff --git a/word2vec_creative_id.py b/word2vec_creative_id.py
index e9acc22..e5e8450 100644
--- a/word2vec_creative_id.py
+++ b/word2vec_creative_id.py
@@ -9,7 +9,7 @@
from gensim.models import KeyedVectors
from gensim.test.utils import common_texts, get_tmpfile
import pickle
-from mail import mail
+from mymail import mail
# %%
df_train = pd.read_csv(
'data/train_preliminary/clicklog_ad_user_train_eval_test.csv')