From faf94fc54472bbb7b80afb595939d6560087fa5e Mon Sep 17 00:00:00 2001 From: sunlanchang Date: Fri, 15 May 2020 21:36:10 +0800 Subject: [PATCH] =?UTF-8?q?=E6=9B=B4=E6=96=B0readme?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- LightGBM_experiment.py | 284 --------------------------------- README.md | 4 +- get_user_embed_multiprocess.py | 74 --------- yrq_word2vec.py | 130 --------------- 4 files changed, 3 insertions(+), 489 deletions(-) delete mode 100644 LightGBM_experiment.py delete mode 100644 get_user_embed_multiprocess.py delete mode 100644 yrq_word2vec.py diff --git a/LightGBM_experiment.py b/LightGBM_experiment.py deleted file mode 100644 index 77fe850..0000000 --- a/LightGBM_experiment.py +++ /dev/null @@ -1,284 +0,0 @@ -# %% -import lightgbm as lgb -import pandas as pd -import numpy as np -import time -from sklearn.metrics import accuracy_score -# %% -print('Loading all data...') -start = time.time() -all_train_data = pd.read_csv('data/train_preliminary/clicklog_ad_user.csv') -df_test = pd.read_csv('data/test/clicklog_ad.csv') -print('Split data into train and validation...') -TRAIN_DATA_PERCENT = 0.9 -msk = np.random.rand(len(all_train_data)) < TRAIN_DATA_PERCENT -df_train = all_train_data[msk] -df_val = all_train_data[~msk] -feature_columns = df_train.columns.values.tolist() - -feature_columns = [ - 'time', - 'user_id', - 'creative_id', - 'click_times', - # 'ad_id', - # 'product_id', - # 'product_category', - # 'advertiser_id', - # 'industry', -] - -# feature_columns.remove('age') -# feature_columns.remove('gender') -label_age, label_gender = ['age'], ['gender'] - -X_train = df_train[feature_columns] -y_train_gender = df_train[label_gender] -# set label 0 and 1 -y_train_gender.gender = y_train_gender.gender-1 - -y_train_age = df_train[label_age] -y_train_age.age = y_train_age.age-1 - -X_val = df_val[feature_columns] -y_val_gender = df_val[label_gender] -y_val_gender.gender = y_val_gender.gender-1 - -y_val_age = df_val[label_age] -y_val_age.age = y_val_age.age-1 - - -X_test = df_test[feature_columns] - -print('Loading data uses {:.1f}s'.format(time.time()-start)) -categorical_feature = [ - # 'industry', - # 'advertiser_id', - # 'product_category', - # 'product_id', - # 'ad_id', - 'creative_id', - 'user_id', -] -# 构建性别数据 -lgb_train_gender = lgb.Dataset( - X_train, y_train_gender, feature_name=feature_columns, categorical_feature=categorical_feature) -lgb_eval_gender = lgb.Dataset( - X_val, y_val_gender, reference=lgb_train_gender, feature_name=feature_columns, categorical_feature=categorical_feature) -# 构建年龄数据 -lgb_train_age = lgb.Dataset( - X_train, y_train_age, feature_name=feature_columns, categorical_feature=categorical_feature) -lgb_eval_age = lgb.Dataset( - X_val, y_val_age, reference=lgb_train_age, feature_name=feature_columns, categorical_feature=categorical_feature) -# %% -# write to hdf5 to read fast -X_train.to_hdf('data/clicklog_ad_user.h5', key='X_train', mode='w') -y_train_gender.to_hdf('data/clicklog_ad_user.h5', - key='y_train_gender', mode='a') -y_train_age.to_hdf('data/clicklog_ad_user.h5', key='y_train_age', mode='a') -X_val.to_hdf('data/clicklog_ad_user.h5', key='X_val', mode='a') -y_val_gender.to_hdf('data/clicklog_ad_user.h5', key='y_val_gender', mode='a') -y_val_age.to_hdf('data/clicklog_ad_user.h5', key='y_val_age', mode='a') -X_test.to_hdf('data/clicklog_ad_user.h5', key='X_test', mode='a') - - -# %% -# read from hdf5 -X_train = pd.read_hdf('data/clicklog_ad_user.h5', key='X_train', mode='r') -y_train_gender = pd.read_hdf('data/clicklog_ad_user.h5', - key='y_train_gender', mode='r') -y_train_age = pd.read_hdf('data/clicklog_ad_user.h5', - key='y_train_age', mode='r') -X_val = pd.read_hdf('data/clicklog_ad_user.h5', key='X_val', mode='r') -y_val_gender = pd.read_hdf('data/clicklog_ad_user.h5', - key='y_val_gender', mode='r') -y_val_age = pd.read_hdf('data/clicklog_ad_user.h5', key='y_val_age', mode='r') -X_test = pd.read_hdf('data/clicklog_ad_user.h5', key='X_test', mode='r') - -# %% - - -def LGBM_gender(): - params_gender = { - 'task': 'train', - 'boosting_type': 'gbdt', - 'objective': 'binary', - 'metric': {'binary_logloss', 'binary_error'}, # evaluate指标 - 'max_depth': -1, # 不限制树深度 - # 更高的accuracy - 'max_bin': 2**10-1, - - 'num_leaves': 2**10, - 'min_data_in_leaf': 1, - 'learning_rate': 0.01, - # 'feature_fraction': 0.9, - # 'bagging_fraction': 0.8, - # 'bagging_freq': 5, - # 'is_provide_training_metric': True, - 'verbose': 1 - } - print('Start training...') - # train - gbm = lgb.train(params_gender, - lgb_train_gender, - num_boost_round=20, - valid_sets=lgb_eval_gender, - # early_stopping_rounds=5, - ) - print('training done!') - print('Saving model...') - # save model to file - gbm.save_model('tmp/model_gender.txt') - print('save model done!') - return gbm - - -# %% -def LGBM_age(): - params_age = { - 'boosting_type': 'gbdt', - 'objective': 'multiclass', - "num_class": 10, - # fine-tuning最重要的三个参数 - 'num_leaves': 2**10-1, - 'max_depth': -1, # 不限制树深度 - 'min_data_in_leaf': 1, - # 更高的accuracy - # 'max_bin': 2**9-1, - - 'metric': {'multi_logloss', 'multi_error'}, - 'learning_rate': 0.1, - - # 'feature_fraction': 0.9, - # 'bagging_fraction': 0.8, - # 'bagging_freq': 5, - 'verbose': 1 - } - print('Start training...') - # train - gbm = lgb.train(params_age, - lgb_train_age, - num_boost_round=20, - valid_sets=lgb_eval_age, - # early_stopping_rounds=5, - ) - print('Saving model...') - # save model to file - gbm.save_model('tmp/model_age.txt') - print('save model done!') - return gbm - - -# %% -# gbm_gender = LGBM_gender() -# gbm_age = LGBM_age() -gbm_gender = lgb.Booster(model_file='tmp/model_gender.txt') -gbm_age = lgb.Booster(model_file='tmp/model_age.txt') - - -# %% -def evaluate(): - print('Start predicting...') - y_pred_gender_probability = gbm_gender.predict( - X_val, num_iteration=gbm_gender.best_iteration) - threshold = 0.5 - y_pred_gender = np.where(y_pred_gender_probability > threshold, 1, 0) - # eval - print('threshold: {:.1f} The accuracy of prediction is:{:.2f}'.format(threshold, - # accuracy_score(y_val_gender, y_pred_gender))) - print('Start evaluate data predicting...') - y_pred_age_probability=gbm_age.predict( - X_val, num_iteration=gbm_age.best_iteration) - y_pred_age=np.argmax(y_pred_age_probability, axis=1) - # eval - print('The accuracy of prediction is:{:.2f}'.format( - accuracy_score(y_val_age, y_pred_age))) - - d={'user_id': X_val.user_id.values.tolist(), 'gender': y_pred_gender.tolist(), - 'age': y_pred_age.tolist()} - ans_df=pd.DataFrame(data=d) - # 投票的方式决定gender、age - ans_df_grouped=ans_df.groupby(['user_id']).agg( - lambda x: x.value_counts().index[0]) - ans_df_grouped.gender=ans_df_grouped.gender+1 - ans_df_grouped.age=ans_df_grouped.age+1 - ans_df_grouped.to_csv('data/ans_eval.csv', header=True) - -def evaluate_generate_train_label(): - print('Start predicting...') - y_pred_gender_probability=gbm_gender.predict( - X_val, num_iteration=gbm_gender.best_iteration) - threshold=0.5 - y_pred_gender=np.where(y_pred_gender_probability > threshold, 1, 0) - # eval - print('threshold: {:.1f} The accuracy of prediction is:{:.2f}'.format(threshold, - # accuracy_score(y_val_gender, y_pred_gender))) - print('Start evaluate data predicting...') - y_pred_age_probability=gbm_age.predict( - X_val, num_iteration=gbm_age.best_iteration) - y_pred_age=np.argmax(y_pred_age_probability, axis=1) - # eval - print('The accuracy of prediction is:{:.2f}'.format( - accuracy_score(y_val_age, y_pred_age))) - - d={'user_id': X_val.user_id.values.tolist(), 'gender': y_pred_gender.tolist(), - 'age': y_pred_age.tolist()} - ans_df=pd.DataFrame(data=d) - # 投票的方式决定gender、age - ans_df_grouped=ans_df.groupby(['user_id']).agg( - lambda x: x.value_counts().index[0]) - ans_df_grouped.gender=ans_df_grouped.gender+1 - ans_df_grouped.age=ans_df_grouped.age+1 - ans_df_grouped.to_csv('data/ans_eval.csv', header=True) - - -# %% -evaluate() -# %% - - -def test(): - print('Start predicting test gender data ...') - y_pred_gender_probability=gbm_gender.predict( - X_test, num_iteration=gbm_gender.best_iteration) - threshold=0.5 - y_pred_gender=np.where(y_pred_gender_probability > threshold, 1, 0) - - print('Start predicting test age data ...') - y_pred_age_probability=gbm_age.predict( - X_test, num_iteration=gbm_age.best_iteration) - y_pred_age=np.argmax(y_pred_age_probability, axis=1) - - print('start voting...') - d={'user_id': X_test.user_id.values.tolist(), - 'predicted_age': y_pred_age.tolist(), - 'predicted_gender': y_pred_gender.tolist(), - } - ans_df=pd.DataFrame(data=d) - # 投票的方式决定gender、age - ans_df_grouped=ans_df.groupby(['user_id']).agg( - lambda x: x.value_counts().index[0]) - ans_df_grouped['user_id']=ans_df_grouped.index - ans_df_grouped.gender=ans_df_grouped.gender+1 - ans_df_grouped.age=ans_df_grouped.age+1 - columns_order=['user_id', 'predicted_age', 'predicted_gender'] - ans_df_grouped[columns_order].to_csv( - 'data/ans_test.csv', header=True, columns=['user_id', 'predicted_age', 'predicted_gender'], index=False) - print('Done!!!') - - -test() -# %% -# for leaves in range(10, 13): -# gbm_age = LGBM_age(leaves) -# y_pred_probability = gbm_age.predict( -# X_val, num_iteration=gbm_age.best_iteration) -# y_pred = np.argmax(y_pred_probability, axis=1) -# print('v'*20) -# print('leaves: ', leaves) -# print('The accuracy of prediction is:{:.2f}'.format( -# accuracy_score(y_val_age, y_pred))) -# print('v'*20) - - -# %% diff --git a/README.md b/README.md index aac70ed..1e44870 100644 --- a/README.md +++ b/README.md @@ -56,8 +56,10 @@ NLP中常用的做法,将用户点击序列中的creative_id或者ad_id集合 ├── README.md ├── img ├── data # 训练和测试数据 +├── word2vec # 保存word2vec生成的向量 +├── word2vec.py # 生成用户的embedding vector ├── process_data.ipynb # 将训练集ad.csv、user.csv合并到click_log.csv,测试集中的ad.csv合并到click_log.csv -└── tmp +└── tmp # 临时文件 ``` # 数据探索 diff --git a/get_user_embed_multiprocess.py b/get_user_embed_multiprocess.py deleted file mode 100644 index 81fa407..0000000 --- a/get_user_embed_multiprocess.py +++ /dev/null @@ -1,74 +0,0 @@ -from multiprocessing import Process -import multiprocessing -import pandas as pd -# 子进程要执行的代码 -from multiprocessing import Pool, cpu_count -import os -import time -import tqdm -import pickle -# print(multiprocessing.cpu_count()) - - -def long_time_task(i, start, end): - pid = os.getpid() - columns = ['c'+str(i) for i in range(128)] - data = {} - for col_name in columns: - data[col_name] = pd.Series([], dtype='float') - df_user_embedding = pd.DataFrame(data) - - for idx in range(start, end): - user_emb = df_creativeid_embedding.loc[seq_creative_id[idx]].mean() - df_user_embedding = df_user_embedding.append( - user_emb, ignore_index=True) - - if idx != start and (idx-start) % 5000 == 0: - print('进程{}: {}/{}'.format(pid, idx-start, end-start)) - if idx != start and (idx-start) % 50000 == 0: - pass - # break - df_user_embedding.to_hdf( - '/tmp/df_user_embedding{}.h5'.format(i), key='df_user_embedding{}'.format(i), mode='w') - - -if __name__ == '__main__': - df_creativeid_embedding = pd.read_hdf( - 'word2vec/df_creativeid_embedding.h5', - key='df_creativeid_embedding', mode='r') - - # with open('word2vec/userid_creativeids.txt', 'r')as f: - # seq_creative_id = f.readlines() - # seq_creative_id = [[str(e) for e in line.strip().split(' ')] - # for line in seq_creative_id] - - # with open('word2vec/seq_creative_id.pkl', 'wb') as f: - # pickle.dump(seq_creative_id, f) - # print('pickle done.') - with open('word2vec/seq_creative_id.pkl', 'rb') as f: - print('start reading...') - seq_creative_id = pickle.load(f) - print('read pickle done.') - - print('当前母进程: {}'.format(os.getpid())) - p = Pool(os.cpu_count()) - - my_cpu_count = os.cpu_count() - num_user = 1900000 - unit = num_user//my_cpu_count - indexes = [] - for idx in range(my_cpu_count): - indexes.append((unit*idx, unit*(idx+1))) - if unit*(idx+1) != num_user: - indexes.append((unit*(idx+1), num_user)) - import time - time_start = time.time() - for i, (start, end) in enumerate(indexes): - p.apply_async(long_time_task, args=(i, start, end)) - p.close() - p.join() - print('等待所有子进程完成。') - print('共使用 {:.2f} min.'.format((time.time()-time_start)/60)) - - # for user in tqdm.tqdm(range(len(seq_creative_id))): - # user_em = df_creativeid_embedding.loc[seq_creative_id[user]].mean() diff --git a/yrq_word2vec.py b/yrq_word2vec.py deleted file mode 100644 index 6c88b53..0000000 --- a/yrq_word2vec.py +++ /dev/null @@ -1,130 +0,0 @@ -# %% -import numpy as np -import pandas as pd -from gensim.test.utils import datapath -from gensim.models.word2vec import LineSentence -from gensim.models import Word2Vec -from gensim.models import KeyedVectors -from gensim.test.utils import common_texts, get_tmpfile -import re -from tqdm import tqdm - -import sys -import time - -# Data_Root -# raw -train_raw_data_root = 'data/train_preliminary' -test_raw_data_root = 'data/test' - -# Env-CSV_Data -# train -train_ad_filepath = train_raw_data_root + '/ad.csv' -train_click_log_filepath = train_raw_data_root + '/click_log.csv' -train_user_filepath = train_raw_data_root + '/user.csv' -# test -test_ad_filepath = test_raw_data_root + '/ad.csv' -test_click_log_filepath = test_raw_data_root + '/click_log.csv' - -# word2vec -word2vec_dict_filepath = 'word2vec/dict.txt' -word2vec_word2vec_model_filepath = 'word2vec/word2vec.model' -word2vec_wordvectors_kv_filepath = 'word2vec/wordvectors.kv' -data_vec_filepath = 'word2vec/data_vec.csv' - - -def data(): - train_ad = pd.read_csv(train_ad_filepath) - print('train_ad Read Done') - train_click_log = pd.read_csv(train_click_log_filepath) - print('train_click_log Read Done') - train_user = pd.read_csv(train_user_filepath) - print('train_user Read Done') - - test_ad = pd.read_csv(test_ad_filepath) - print('test_ad Read Done') - test_click_log = pd.read_csv(test_click_log_filepath) - print('test_click_log Read Done') - print('\nData Read Done\n') - return train_ad, train_click_log, train_user, test_ad, test_click_log - - -# %% -train_ad, train_click_log, train_user, test_ad, test_click_log = data() - -click_log = train_click_log.append(test_click_log) -ad = train_ad.append(test_ad) -data = pd.merge(click_log, ad, on='creative_id', how='left').fillna( - int(-1)).replace('\\N', int(-1)).astype(int) -data_creativeid = data.groupby("user_id")['creative_id'].apply( - list).reset_index(name='creative_id') -# data_product_category = data.groupby("user_id")['product_category'].apply( -# list).reset_index(name='product_category') -# dict = pd.merge(data_industry, data_product_category, -# on='user_id', how='inner') -# product_category_tmp = [] -# with tqdm(total=int(len(data_creativeid))) as pbar: -# for j in dict["product_category"]: -# product_category_tmp.append([i+400 for i in j]) -# pbar.update(1) -# tmp = pd.Series(product_category_tmp) - -# dict["product_category_industry"] = tmp + \ -# dict["industry"] # .map(str) product_category+400 - -#dict.drop(labels='user_id',axis=1).to_csv(word2vec_dict_filepath, index=False, header=False) -# with open(word2vec_dict_filepath, 'w')as f: -# with tqdm(total=int(len(dict['product_category_industry']))) as pbar: -# for i in dict['product_category_industry']: -# i = [str(e) for e in i] -# line = ' '.join(i) -# f.write(line+'\n') -# pbar.update(1) -# sentences = LineSentence(word2vec_dict_filepath) -# dimension_embedding = 32 -# model = Word2Vec(sentences, size=dimension_embedding, -# window=3, min_count=1, workers=-1) -# model.save(word2vec_word2vec_model_filepath) -# model.wv.save(word2vec_wordvectors_kv_filepath) -# %% -word2vec_wordvectors_kv_filepath = 'word2vec/wordvectors.kv' -wv = KeyedVectors.load(word2vec_wordvectors_kv_filepath, mmap='r') - -dict_embd_creativeid = {} -with tqdm(total=int(len(wv.vocab))) as pbar: - for key in wv.vocab: - dict_embd_creativeid[key] = wv[key].tolist() - pbar.update(1) -# %% -creative_id_tmp = pd.Series( - list(dict_embd_creativeid.keys())).astype(int) -# %% -vec_tmp = pd.Series(list(dict_embd_creativeid.values())) -embd_creativeid_pd = pd.DataFrame( - columns=['creative_id', 'vec']) -embd_creativeid_pd['creative_id'] = creative_id_tmp -embd_creativeid_pd['vec'] = vec_tmp -# %% -data_vec_creative_id_tmp = pd.DataFrame(data, columns=['user_id', 'product_category']).rename( - {'product_category': 'creative_id'}, axis='columns') -data_vec_industry_tmp = pd.DataFrame(data, columns=['user_id', 'industry']).rename( - {'industry': 'creative_id'}, axis='columns') -data_vec_creative_id_tmp['creative_id'] = data_vec_creative_id_tmp['creative_id']+400 - -data_vec_product_category = pd.merge( - data_vec_creative_id_tmp, embd_creativeid_pd, on='creative_id', how='left') -data_vec_industry = pd.merge( - data_vec_industry_tmp, embd_creativeid_pd, on='creative_id', how='left') - -data_vec_tmp = pd.concat( - [data_vec_product_category, data_vec_industry], names=['user_id', 'vec']) - -data_vec_list_tmp_pd = pd.DataFrame(pd.DataFrame( - data_vec_tmp, columns=['user_id', 'vec'])['vec'].tolist()) -data_user_id_list_tmp_pd = pd.DataFrame(pd.DataFrame( - data_vec_tmp, columns=['user_id', 'vec'])['user_id'].tolist()) -data_vec_tmp = pd.merge(data_user_id_list_tmp_pd, data_vec_list_tmp_pd, left_index=True, - right_index=True).rename({'0_x': 'user_id', '0_y': '0'}, axis='columns') -data_vec = data_vec_tmp.groupby("user_id").mean() - -data_vec.to_csv(data_vec_filepath, header=False, index=True)