From 632485850a66b347aecd06bcd812568c212aa892 Mon Sep 17 00:00:00 2001 From: sunlanchang Date: Fri, 8 May 2020 13:23:58 +0800 Subject: [PATCH] update code --- process_data.ipynb | 204 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 181 insertions(+), 23 deletions(-) diff --git a/process_data.ipynb b/process_data.ipynb index 928215a..d4eca17 100644 --- a/process_data.ipynb +++ b/process_data.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -205,24 +205,6 @@ "train_user.dtypes" ] }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": "(2481135,)" - }, - "metadata": {}, - "execution_count": 15 - } - ], - "source": [ - "train_ad.creative_id.unique().shape" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -313,13 +295,189 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ - "train_tmp_click_log = 'data/train_tmp/click_log.csv'\n", - "# train_click_log_path = 'data/train_preliminary/click_log.csv'\n", - "train_click_log = pd.read_csv(train_click_log_path)" + "# train_tmp_click_log = 'data/train_tmp/click_log.csv'\n", + "test_click_log_path = 'data/test/click_log.csv'\n", + "test_click_log = pd.read_csv(test_click_log_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": "((91,), (1000000,), (2618159,), (93,))" + }, + "metadata": {}, + "execution_count": 15 + } + ], + "source": [ + "test_click_log.time.unique().shape, test_click_log.user_id.unique().shape, test_click_log.creative_id.unique().shape, test_click_log.click_times.unique().shape" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": "time int64\nuser_id int64\ncreative_id int64\nclick_times int64\ndtype: object" + }, + "metadata": {}, + "execution_count": 16 + } + ], + "source": [ + "test_click_log.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "test_ad = pd.read_csv('data/train_preliminary/ad.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": "creative_id int64\nad_id int64\nproduct_id object\nproduct_category int64\nadvertiser_id int64\nindustry object\ndtype: object" + }, + "metadata": {}, + "execution_count": 18 + } + ], + "source": [ + "test_ad.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "test_ad.product_id[test_ad.product_id=='\\\\N']='0'\n", + "test_ad.industry[test_ad.industry=='\\\\N']='0'" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "test_ad.product_id = test_ad.product_id.astype(int)\n", + "test_ad.industry = test_ad.industry.astype(int)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": "creative_id int64\nad_id int64\nproduct_id int64\nproduct_category int64\nadvertiser_id int64\nindustry int64\ndtype: object" + }, + "metadata": {}, + "execution_count": 21 + } + ], + "source": [ + "test_ad.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": " creative_id ad_id product_id product_category advertiser_id industry\n0 1 1 0 5 381 78\n1 4 4 0 5 108 202\n2 7 7 0 5 148 297\n3 8 8 0 5 713 213\n4 9 9 0 5 695 213", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
creative_idad_idproduct_idproduct_categoryadvertiser_idindustry
0110538178
14405108202
27705148297
38805713213
49905695213
\n
" + }, + "metadata": {}, + "execution_count": 23 + } + ], + "source": [ + "test_ad.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": " time user_id creative_id click_times ad_id product_id \\\n0 20 3131989 645764 1 573314.0 58.0 \n1 20 3131989 1027422 1 902764.0 129.0 \n2 20 3131989 1106443 1 970829.0 2171.0 \n3 20 3131989 629802 1 559183.0 0.0 \n4 59 3131989 2839769 1 2441288.0 129.0 \n\n product_category advertiser_id industry \n0 2.0 14689.0 6.0 \n1 2.0 42272.0 6.0 \n2 2.0 37513.0 322.0 \n3 18.0 14678.0 26.0 \n4 2.0 35328.0 6.0 ", + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
timeuser_idcreative_idclick_timesad_idproduct_idproduct_categoryadvertiser_idindustry
02031319896457641573314.058.02.014689.06.0
120313198910274221902764.0129.02.042272.06.0
220313198911064431970829.02171.02.037513.0322.0
32031319896298021559183.00.018.014678.026.0
4593131989283976912441288.0129.02.035328.06.0
\n
" + }, + "metadata": {}, + "execution_count": 22 + } + ], + "source": [ + "test_clicklog_ad = pd.merge(test_click_log, test_ad, on='creative_id', how='left')\n", + "test_clicklog_ad.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": "(2618159,)" + }, + "metadata": {}, + "execution_count": 30 + } + ], + "source": [ + "test_click_log.creative_id.unique().shape" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": "(2481135,)" + }, + "metadata": {}, + "execution_count": 31 + } + ], + "source": [ + "test_ad.creative_id.unique().shape" ] } ],