From ff53f9f9e83e12cb2270fc9528668328a3818c85 Mon Sep 17 00:00:00 2001 From: sunlanchang Date: Tue, 19 May 2020 22:34:46 +0800 Subject: [PATCH] =?UTF-8?q?=E6=B7=BB=E5=8A=A0tf-idf?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ..._embedding.py => Dense_embedding_16_dim.py | 0 Dense_embedding_96_dim.py | 259 +++++++++++++++ LightGBM_embedding_128_dim.py | 301 ++++++++++++++++++ ...bedding.py => LightGBM_embedding_16_dim.py | 0 test.py | 127 +++++++- tf_idf.py | 127 ++++++++ 6 files changed, 808 insertions(+), 6 deletions(-) rename Dense_embedding.py => Dense_embedding_16_dim.py (100%) create mode 100644 Dense_embedding_96_dim.py create mode 100644 LightGBM_embedding_128_dim.py rename LightGBM_embedding.py => LightGBM_embedding_16_dim.py (100%) create mode 100644 tf_idf.py diff --git a/Dense_embedding.py b/Dense_embedding_16_dim.py similarity index 100% rename from Dense_embedding.py rename to Dense_embedding_16_dim.py diff --git a/Dense_embedding_96_dim.py b/Dense_embedding_96_dim.py new file mode 100644 index 0000000..757eee6 --- /dev/null +++ b/Dense_embedding_96_dim.py @@ -0,0 +1,259 @@ +# %% +from tensorflow.keras.layers import Dense +from tensorflow.keras.wrappers.scikit_learn import KerasClassifier +from sklearn.model_selection import cross_val_score +from sklearn.preprocessing import LabelEncoder +from sklearn.model_selection import StratifiedKFold +from sklearn.preprocessing import StandardScaler +from sklearn.pipeline import Pipeline +import pandas as pd +import numpy as np +import tensorflow as tf +from tensorflow import keras +from keras.utils import to_categorical +# import keras +from tensorflow.keras import layers +from tensorflow.keras.utils import multi_gpu_model +import gc +# %% +vec_dirs = ['ad_id', 'advertiser_id', 'creative_id', + 'industry', 'product_category'] +samples = 10000 +user_train = pd.read_csv( + 'data/train_preliminary/user.csv').sort_values(['user_id'], ascending=(True,)) +train_data = pd.DataFrame({'user_id': user_train.user_id}) + +user_id_test = pd.read_csv( + 'data/test/clicklog_ad_user_test.csv').sort_values(['user_id'], ascending=(True,)).user_id.unique() +test_data = pd.DataFrame({'user_id': user_id_test}) + +for i, vec_dir in enumerate(vec_dirs): + columns = [str(n) for n in range(i*16, (i+1)*16)] + train = pd.read_csv('word2vec/'+vec_dir+'data_vec.csv', + nrows=900000, skiprows=None, names=columns) + test = pd.read_csv('word2vec/'+vec_dir+'data_vec.csv', + names=columns, + skiprows=900001) + train_data = pd.merge(train_data, train, on='user_id', how='inner') + test_data = pd.merge(test_data, test, on='user_id', how='inner') + +X_train = train_data[columns].values +del train_data +gc.collect() +X_test = test_data[columns].values +del test_data +gc.collect() + +Y_gender = user_train['gender'].values +Y_age = user_train['age'].values + +user_id_test = pd.read_csv( + 'data/test/clicklog_ad_user_test.csv').sort_values(['user_id'], ascending=(True,)).user_id.unique() +ans = pd.DataFrame({'user_id': user_id_test}) +# %% + + +def create_gender_model(): + model = keras.Sequential( + [ + keras.Input(shape=(128,)), + layers.Dense(256, activation="elu"), + layers.Dense(512, activation="elu"), + # layers.Dense(1024, activation="elu"), + # layers.Dense(512, activation="elu"), + layers.Dense(256, activation="elu"), + layers.Dense(128, activation='elu'), + # layers.Dense(2, activation='softmax', name='classifier') + Dense(1, activation='sigmoid') + ] + ) + model.compile(loss='binary_crossentropy', + optimizer='Adam', metrics=['accuracy']) + # model.summary() + return model + +# %% + + +def create_age_model(): + model = keras.Sequential( + [ + keras.Input(shape=(128,)), + layers.Dense(256, activation="elu"), + layers.Dense(512, activation="elu"), + # layers.Dense(1024, activation="elu"), + # layers.Dense(512, activation="elu"), + layers.Dense(256, activation="elu"), + layers.Dense(128, activation='elu'), + Dense(10, activation='softmax') + ] + ) + model.compile(loss='categorical_crossentropy', + optimizer='Adam', metrics=['accuracy']) + # model.summary() + return model + +# %% + + +def train_gender(X, Y, X_test, train=True, epoch=10, batch_size=1024): + # 类别转换为0和1 + encoder = LabelEncoder() + encoder.fit(Y) + Y_encoded = encoder.transform(Y) + if train: + scaler = StandardScaler() + scaler.fit(X) + X = scaler.transform(X) + model = create_gender_model() + model.fit(X, Y_encoded, batch_size=batch_size, epochs=epoch) + + X_test = scaler.transform(X_test) + y_pre = model.predict(X_test) + threshold = 0.5 + y_pred_gender = np.where(y_pre > threshold, 1, 0) + return y_pred_gender + else: + estimators = [] + estimators.append(('standardize', StandardScaler())) + estimators.append(('mlp', KerasClassifier( + build_fn=create_gender_model, epochs=epoch, batch_size=batch_size, verbose=0))) + pipeline = Pipeline(estimators) + kfold = StratifiedKFold(n_splits=5, shuffle=True) + results = cross_val_score(pipeline, X, Y_encoded, cv=kfold) + print("Baseline: %.2f%% (%.2f%%)" % + (results.mean()*100, results.std()*100)) + + # parallel_model = multi_gpu_model(model, gpus=2) + # parallel_model = model + # parallel_model.fit(X, Y, epochs=10, batch_size=batch_size) + # gender_pred = parallel_model.predict(X_test, batch_size=batch_size) + # return gender_pred +# %% + + +def train_age(X, Y, X_test, train=True, epoch=10, batch_size=1024): + # 类别转换为0和1 + encoder = LabelEncoder() + encoder.fit(Y) + Y = encoder.transform(Y) + if train: + scaler = StandardScaler() + scaler.fit(X) + X = scaler.transform(X) + Y = to_categorical(Y) + model = create_age_model() + model.fit(X, Y, batch_size=batch_size, epochs=epoch) + + X_test = scaler.transform(X_test) + y_pre = model.predict(X_test) + y_pred_age = np.argmax(y_pre, axis=1) + + return y_pred_age + else: + # estimator = KerasClassifier( + # build_fn=create_gender_model, epochs=epoch, batch_size=batch_size, verbose=0) + estimators = [] + estimators.append(('standardize', StandardScaler())) + estimators.append(('mlp', KerasClassifier( + build_fn=create_age_model, epochs=epoch, batch_size=batch_size, verbose=0))) + pipeline = Pipeline(estimators) + kfold = StratifiedKFold(n_splits=10, shuffle=True) + results = cross_val_score(pipeline, X, Y, cv=kfold) + print("Baseline: %.2f%% (%.2f%%)" % + (results.mean()*100, results.std()*100)) + + +# %% +y_gender = train_gender(X_train, Y_gender, X_test, + train=False, epoch=50, batch_size=4096) +y_age = train_age(X_train, Y_age, X_test, + train=False, epoch=50, batch_size=4096) +# %% +ans['predicted_age'] = y_age+1 +ans['predicted_gender'] = y_gender+1 +# %% +ans.to_csv('data/ans/word2vec.csv', + columns=['user_id', 'predicted_age', 'predicted_gender'], + header=True, + index=False, + ) + +# %% +# %% +# df_train = df_train.sort_values( +# ["user_id"], ascending=(True,)) + +# # %% + + +# def get_batch(file_name,): +# for row in open(file_name, "r"): +# yield 1 + + +# for line in get_batch('data/train_data.csv'): +# for line in get_batch('test.py'): +# print(line) +# break +# %% +# 合成用户embedding +# path = "word2vec/wordvectors.kv" +# wv = KeyedVectors.load(path, mmap='r') +# with open('word2vec/userid_creativeids.txt', 'r')as f: +# lines = f.readlines() +# lines = [[int(e) for e in line.split(' ')] for line in lines] +# number_train_user = 900000 +# number_test_user = 1000000 +# user_train = lines[:number_train_user] +# user_test = lines[number_train_user:] +# columns = ['c'+str(i) for i in range(128)] +# data = {} +# for col_name in columns: +# data[col_name] = pd.Series([], dtype='float') +# df_user_train = pd.DataFrame(data) +# df_user_test = pd.DataFrame(data) +# # %% +# for line in tqdm.tqdm(user_train): +# user_embedding_train = np.zeros(128) +# for creative_id in line: +# user_embedding_train += wv[str(creative_id)] +# user_embedding_train = user_embedding_train / len(line) +# tmp = pd.DataFrame(user_embedding_train.reshape(-1, +# len(user_embedding_train)), columns=columns) +# df_user_train = df_user_train.append(tmp) +# # %% +# for line in tqdm.tqdm(user_test): +# user_embedding_test = np.zeros(128) +# for creative_id in line: +# user_embedding_test += wv[str(creative_id)] +# user_embedding_test = user_embedding_test / len(line) +# tmp = pd.DataFrame(user_embedding_test.reshape(-1, +# len(user_embedding_train)), columns=columns) +# df_user_test = df_user_test.append(tmp) +# # %% +# # 将同一个用户creative_id相加平均后即为一个用户的Embedding +# all_train_data = pd.read_csv( +# 'data/train_preliminary/clicklog_ad_user_train_eval_test.csv') +# all_train_data = all_train_data.sort_values( +# ["user_id"], ascending=(True)) +# # %% +# all_test_data = pd.read_csv( +# 'data/test/clicklog_ad_user_test.csv') +# all_test_data = all_test_data.sort_values( +# ["user_id"], ascending=(True)) +# # %% +# assert df_user_train.shape[0] == all_train_data.shape[0] +# df_user_train['user_id'] = all_train_data['user_id'] +# df_user_train['gender'] = all_train_data['gender'] +# df_user_train['age'] = all_train_data['age'] +# df_user_train.to_hdf('word2vec/df_user_train_test.h5', +# key='df_user_train', mode='w') +# # %% +# assert df_user_test.shape[0] == all_test_data.shape[0] +# df_user_test['user_id'] = all_test_data['user_id'] +# df_user_test.to_hdf('word2vec/df_user_train_test.h5', +# key='df_user_test', mode='a') + + +# %% diff --git a/LightGBM_embedding_128_dim.py b/LightGBM_embedding_128_dim.py new file mode 100644 index 0000000..53fcb1a --- /dev/null +++ b/LightGBM_embedding_128_dim.py @@ -0,0 +1,301 @@ +# %% +from tensorflow.keras.layers import Dense +from tensorflow.keras.wrappers.scikit_learn import KerasClassifier +from sklearn.model_selection import cross_val_score +from sklearn.preprocessing import LabelEncoder +from sklearn.model_selection import StratifiedKFold +from sklearn.preprocessing import StandardScaler +from sklearn.pipeline import Pipeline +import lightgbm as lgb +import time +from sklearn.metrics import accuracy_score +import pandas as pd +import numpy as np +import tensorflow as tf +from tensorflow import keras +from keras.utils import to_categorical +from tensorflow.keras import layers +from tensorflow.keras.utils import multi_gpu_model +import gc +# %% +samples = 1000 +all_train_data = pd.read_csv('word2vec/word2vec/train_data_128_dim.csv', + nrows=900000, skiprows=None) +# nrows=samples, skiprows=None).sort_values(['user_id'], ascending=(True,)) +columns = all_train_data.columns.values.tolist() +test_data = pd.read_csv('word2vec/data_vec_product_category_industry_16dimension.csv', + names=columns, + skiprows=900001, + # nrows=samples, + ).sort_values(['user_id'], ascending=(True,)) + +user_train = pd.read_csv( + 'data/train_preliminary/user.csv').sort_values(['user_id'], ascending=(True,)) +Y_gender = user_train['gender'].values +Y_age = user_train['age'].values + +all_train_data['gender'] = user_train.gender +all_train_data['age'] = user_train.age + +TRAIN_DATA_PERCENT = 0.9 +mask = np.random.rand(len(all_train_data)) < TRAIN_DATA_PERCENT +df_train = all_train_data[mask] +df_val = all_train_data[~mask] + +X_train = df_train[columns].values +Y_train_gender = df_train.gender.values +Y_train_age = df_train.age.values + +X_val = df_val[columns].values +Y_val_gender = df_val.gender.values +Y_val_age = df_val.age.values +# del train_data +# gc.collect() +X_test = test_data[columns].values +# del test_data +# gc.collect() + +user_id_test = pd.read_csv( + 'data/test/clicklog_ad_user_test.csv').sort_values(['user_id'], ascending=(True,)).user_id.unique() +ans = pd.DataFrame({'user_id': user_id_test}) +# %% +# 构建性别数据 +encoder = LabelEncoder() +encoder.fit(Y_train_gender) +Y_train_gender = encoder.transform(Y_train_gender) +Y_val_gender = encoder.transform(Y_val_gender) + +lgb_train_gender = lgb.Dataset(X_train, Y_train_gender) +lgb_eval_gender = lgb.Dataset(X_val, Y_val_gender, reference=lgb_train_gender) +# 构建年龄数据 +encoder = LabelEncoder() +encoder.fit(Y_train_age) +Y_train_age = encoder.transform(Y_train_age) +Y_val_age = encoder.transform(Y_val_age) + +lgb_train_age = lgb.Dataset(X_train, Y_train_age) +lgb_eval_age = lgb.Dataset(X_val, Y_val_age, reference=lgb_train_age) + +# %% + + +def LGBM_gender(): + params_gender = { + 'task': 'train', + 'boosting_type': 'gbdt', + 'objective': 'binary', + 'metric': {'binary_logloss', 'binary_error'}, # evaluate指标 + 'max_depth': -1, # 不限制树深度 + # 更高的accuracy + 'max_bin': 2**10-1, + + 'num_leaves': 2**10, + 'min_data_in_leaf': 1, + 'learning_rate': 0.01, + # 'feature_fraction': 0.9, + # 'bagging_fraction': 0.8, + # 'bagging_freq': 5, + # 'is_provide_training_metric': True, + 'verbose': 1 + } + print('Start training...') + # train + gbm = lgb.train(params_gender, + lgb_train_gender, + num_boost_round=50, + valid_sets=lgb_eval_gender, + # early_stopping_rounds=5, + ) + print('training done!') + print('Saving model...') + # save model to file + gbm.save_model('tmp/model_gender.txt') + print('save model done!') + return gbm +# %% + + +def LGBM_age(): + params_age = { + 'boosting_type': 'gbdt', + 'objective': 'multiclass', + "num_class": 10, + # fine-tuning最重要的三个参数 + 'num_leaves': 2**10-1, + 'max_depth': -1, # 不限制树深度 + 'min_data_in_leaf': 1, + # 更高的accuracy + # 'max_bin': 2**9-1, + + 'metric': {'multi_logloss', 'multi_error'}, + 'learning_rate': 0.1, + + # 'feature_fraction': 0.9, + # 'bagging_fraction': 0.8, + # 'bagging_freq': 5, + 'verbose': 1 + } + print('Start training...') + # train + gbm = lgb.train(params_age, + lgb_train_age, + num_boost_round=50, + valid_sets=lgb_eval_age, + # early_stopping_rounds=5, + ) + print('Saving model...') + # save model to file + gbm.save_model('tmp/model_age.txt') + print('save model done!') + return gbm + + +# %% +gbm_gender = LGBM_gender() +# %% +gbm_age = LGBM_age() +# %% + + +def evaluate(): + print('Start predicting...') + y_pred_gender_probability = gbm_gender.predict( + X_val, num_iteration=gbm_gender.best_iteration) + threshold = 0.5 + y_pred_gender = np.where(y_pred_gender_probability > threshold, 1, 0) + # eval + print('threshold: {:.1f} The accuracy of prediction is:{:.2f}'.format(threshold, + accuracy_score(Y_val_gender, y_pred_gender))) + print('Start evaluate data predicting...') + y_pred_age_probability = gbm_age.predict( + X_val, num_iteration=gbm_age.best_iteration) + y_pred_age = np.argmax(y_pred_age_probability, axis=1) + # eval + print('The accuracy of prediction is:{:.2f}'.format( + accuracy_score(Y_val_age, y_pred_age))) + + # d = {'user_id': X_val.user_id.values.tolist(), 'gender': Y_pred_gender.tolist(), + # 'age': y_pred_age.tolist()} + # ans_df = pd.DataFrame(data=d) + # # 投票的方式决定gender、age + # ans_df_grouped = ans_df.groupby(['user_id']).agg( + # lambda x: x.value_counts().index[0]) + # ans_df_grouped.gender = ans_df_grouped.gender+1 + # ans_df_grouped.age = ans_df_grouped.age+1 + # ans_df_grouped.to_csv('data/ans.csv', header=True) + + +# %% +evaluate() +# %% + + +def test(): + print('Start predicting test gender data ...') + y_pred_gender_probability = gbm_gender.predict( + X_test, num_iteration=gbm_gender.best_iteration) + threshold = 0.5 + y_pred_gender = np.where(y_pred_gender_probability > threshold, 1, 0) + + print('Start predicting test age data ...') + y_pred_age_probability = gbm_age.predict( + X_test, num_iteration=gbm_age.best_iteration) + y_pred_age = np.argmax(y_pred_age_probability, axis=1) + + ans['predicted_age'] = y_pred_age+1 + ans['predicted_gender'] = y_pred_gender+1 + ans.to_csv('data/ans/LGBM.csv', header=True, index=False, + columns=['user_id', 'predicted_age', 'predicted_gender']) + + # ans_df = pd.DataFrame(data=d) + # 投票的方式决定gender、age + # ans_df_grouped = ans_df.groupby(['user_id']).agg( + # lambda x: x.value_counts().index[0]) + # ans_df_grouped['user_id'] = ans_df_grouped.index + # ans_df_grouped.gender = ans_df_grouped.gender+1 + # ans_df_grouped.age = ans_df_grouped.age+1 + # columns_order = ['user_id', 'predicted_age', 'predicted_gender'] + # ans_df_grouped[columns_order].to_csv( + # 'data/ans_test.csv', header=True, columns=['user_id', 'predicted_age', 'predicted_gender'], index=False) + # print('Done!!!') + + +test() +# %% +# %% +# df_train = df_train.sort_values( +# ["user_id"], ascending=(True,)) + +# # %% + + +# def get_batch(file_name,): +# for row in open(file_name, "r"): +# yield 1 + + +# for line in get_batch('data/train_data.csv'): +# for line in get_batch('test.py'): +# print(line) +# break +# %% +# 合成用户embedding +# path = "word2vec/wordvectors.kv" +# wv = KeyedVectors.load(path, mmap='r') +# with open('word2vec/userid_creativeids.txt', 'r')as f: +# lines = f.readlines() +# lines = [[int(e) for e in line.split(' ')] for line in lines] +# number_train_user = 900000 +# number_test_user = 1000000 +# user_train = lines[:number_train_user] +# user_test = lines[number_train_user:] +# columns = ['c'+str(i) for i in range(128)] +# data = {} +# for col_name in columns: +# data[col_name] = pd.Series([], dtype='float') +# df_user_train = pd.DataFrame(data) +# df_user_test = pd.DataFrame(data) +# # %% +# for line in tqdm.tqdm(user_train): +# user_embedding_train = np.zeros(128) +# for creative_id in line: +# user_embedding_train += wv[str(creative_id)] +# user_embedding_train = user_embedding_train / len(line) +# tmp = pd.DataFrame(user_embedding_train.reshape(-1, +# len(user_embedding_train)), columns=columns) +# df_user_train = df_user_train.append(tmp) +# # %% +# for line in tqdm.tqdm(user_test): +# user_embedding_test = np.zeros(128) +# for creative_id in line: +# user_embedding_test += wv[str(creative_id)] +# user_embedding_test = user_embedding_test / len(line) +# tmp = pd.DataFrame(user_embedding_test.reshape(-1, +# len(user_embedding_train)), columns=columns) +# df_user_test = df_user_test.append(tmp) +# # %% +# # 将同一个用户creative_id相加平均后即为一个用户的Embedding +# all_train_data = pd.read_csv( +# 'data/train_preliminary/clicklog_ad_user_train_eval_test.csv') +# all_train_data = all_train_data.sort_values( +# ["user_id"], ascending=(True)) +# # %% +# all_test_data = pd.read_csv( +# 'data/test/clicklog_ad_user_test.csv') +# all_test_data = all_test_data.sort_values( +# ["user_id"], ascending=(True)) +# # %% +# assert df_user_train.shape[0] == all_train_data.shape[0] +# df_user_train['user_id'] = all_train_data['user_id'] +# df_user_train['gender'] = all_train_data['gender'] +# df_user_train['age'] = all_train_data['age'] +# df_user_train.to_hdf('word2vec/df_user_train_test.h5', +# key='df_user_train', mode='w') +# # %% +# assert df_user_test.shape[0] == all_test_data.shape[0] +# df_user_test['user_id'] = all_test_data['user_id'] +# df_user_test.to_hdf('word2vec/df_user_train_test.h5', +# key='df_user_test', mode='a') + + +# %% diff --git a/LightGBM_embedding.py b/LightGBM_embedding_16_dim.py similarity index 100% rename from LightGBM_embedding.py rename to LightGBM_embedding_16_dim.py diff --git a/test.py b/test.py index c266805..249631d 100644 --- a/test.py +++ b/test.py @@ -1,12 +1,127 @@ # %% +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.model_selection import train_test_split import pandas as pd -import os +import numpy as np +import lightgbm as lgb # %% -user_embeddings = [] -for i in range(os.cpu_count()): - tmp = pd.read_hdf( - '/tmp/df_user_embedding{}.h5'.format(i), key='df_user_embedding{}'.format(i), mode='r') - user_embeddings.append(tmp) +user = pd.read_csv( + 'data/train_preliminary/user.csv').sort_values(['user_id'], ascending=(True,)) +# %% +Y_train_gender = user.gender +Y_train_age = user.age +corpus = [] +f = open('word2vec/userid_creativeids.txt', 'r') +flag = 0 +for row in f: + # row = [[int(e) for e in seq] for seq in row.strip().split(' ')] + row = row.strip() + corpus.append(row) + flag += 1 + if flag == 100: + break +# %% +Y_train_gender = Y_train_gender.iloc[:flag]-1 +Y_train_age = Y_train_age.iloc[:flag]-1 +# %% +vectorizer = TfidfVectorizer( + token_pattern=r"(?u)\b\w+\b", + min_df=1, + # max_features=128, + dtype=np.float32, +) +X_train = vectorizer.fit_transform(corpus) +print(X_train.shape) +# %% +X_train_gender, X_val_gender, Y_train_gender, Y_val_gender = train_test_split( + X_train, Y_train_gender, train_size=0.9, random_state=1) +lgb_train_gender = lgb.Dataset(X_train_gender, Y_train_gender) +lgb_eval_gender = lgb.Dataset( + X_val_gender, Y_val_gender, reference=lgb_train_gender) + +X_train_age, X_val_age, Y_train_age, Y_val_age = train_test_split( + X_train, Y_train_age, train_size=0.9, random_state=1) +lgb_train_age = lgb.Dataset(X_train_age, Y_train_age) +lgb_eval_age = lgb.Dataset( + X_val_age, Y_val_age, reference=lgb_train_age) +# %% + + +def LGBM_gender(epoch, early_stopping_rounds): + params_gender = { + 'task': 'train', + 'boosting_type': 'gbdt', + 'objective': 'binary', + 'metric': {'binary_logloss', 'binary_error'}, # evaluate指标 + 'max_depth': -1, # 不限制树深度 + # 更高的accuracy + 'max_bin': 2**10-1, + + 'num_leaves': 2**10, + 'min_data_in_leaf': 1, + 'learning_rate': 0.01, + # 'feature_fraction': 0.9, + # 'bagging_fraction': 0.8, + # 'bagging_freq': 5, + # 'is_provide_training_metric': True, + 'verbose': 1 + } + print('Start training...') + # train + gbm = lgb.train(params_gender, + lgb_train_gender, + num_boost_round=epoch, + valid_sets=lgb_eval_gender, + # early_stopping_rounds=5, + ) + print('training done!') + print('Saving model...') + # save model to file + gbm.save_model('tmp/model_gender.txt') + print('save model done!') + return gbm +# %% + + +def LGBM_age(epoch, early_stopping_rounds): + params_age = { + 'boosting_type': 'gbdt', + 'objective': 'multiclass', + "num_class": 10, + # fine-tuning最重要的三个参数 + 'num_leaves': 2**10-1, + 'max_depth': -1, # 不限制树深度 + 'min_data_in_leaf': 1, + # 更高的accuracy + # 'max_bin': 2**9-1, + # 'num_iterations': 50, # epoch + 'metric': {'multi_logloss', 'multi_error'}, + 'learning_rate': 0.01, + + 'feature_fraction': 0.9, + 'bagging_fraction': 0.8, + # 'bagging_freq': 5, + 'verbose': 1 + } + print('Start training...') + # train + gbm = lgb.train(params_age, + lgb_train_age, + num_boost_round=epoch, + valid_sets=lgb_eval_age, + early_stopping_rounds=1000, + ) + print('Saving model...') + # save model to file + gbm.save_model('tmp/model_age.txt') + print('save model done!') + return gbm + + +LGBM_age(epoch=10, early_stopping_rounds=1000) +# %% + +LGBM_gender(epoch=50, early_stopping_rounds=1000) # %% diff --git a/tf_idf.py b/tf_idf.py new file mode 100644 index 0000000..249631d --- /dev/null +++ b/tf_idf.py @@ -0,0 +1,127 @@ +# %% +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.model_selection import train_test_split +import pandas as pd +import numpy as np +import lightgbm as lgb +# %% +user = pd.read_csv( + 'data/train_preliminary/user.csv').sort_values(['user_id'], ascending=(True,)) +# %% +Y_train_gender = user.gender +Y_train_age = user.age +corpus = [] +f = open('word2vec/userid_creativeids.txt', 'r') +flag = 0 +for row in f: + # row = [[int(e) for e in seq] for seq in row.strip().split(' ')] + row = row.strip() + corpus.append(row) + flag += 1 + if flag == 100: + break +# %% +Y_train_gender = Y_train_gender.iloc[:flag]-1 +Y_train_age = Y_train_age.iloc[:flag]-1 +# %% +vectorizer = TfidfVectorizer( + token_pattern=r"(?u)\b\w+\b", + min_df=1, + # max_features=128, + dtype=np.float32, +) +X_train = vectorizer.fit_transform(corpus) +print(X_train.shape) +# %% +X_train_gender, X_val_gender, Y_train_gender, Y_val_gender = train_test_split( + X_train, Y_train_gender, train_size=0.9, random_state=1) +lgb_train_gender = lgb.Dataset(X_train_gender, Y_train_gender) +lgb_eval_gender = lgb.Dataset( + X_val_gender, Y_val_gender, reference=lgb_train_gender) + +X_train_age, X_val_age, Y_train_age, Y_val_age = train_test_split( + X_train, Y_train_age, train_size=0.9, random_state=1) +lgb_train_age = lgb.Dataset(X_train_age, Y_train_age) +lgb_eval_age = lgb.Dataset( + X_val_age, Y_val_age, reference=lgb_train_age) +# %% + + +def LGBM_gender(epoch, early_stopping_rounds): + params_gender = { + 'task': 'train', + 'boosting_type': 'gbdt', + 'objective': 'binary', + 'metric': {'binary_logloss', 'binary_error'}, # evaluate指标 + 'max_depth': -1, # 不限制树深度 + # 更高的accuracy + 'max_bin': 2**10-1, + + 'num_leaves': 2**10, + 'min_data_in_leaf': 1, + 'learning_rate': 0.01, + # 'feature_fraction': 0.9, + # 'bagging_fraction': 0.8, + # 'bagging_freq': 5, + # 'is_provide_training_metric': True, + 'verbose': 1 + } + print('Start training...') + # train + gbm = lgb.train(params_gender, + lgb_train_gender, + num_boost_round=epoch, + valid_sets=lgb_eval_gender, + # early_stopping_rounds=5, + ) + print('training done!') + print('Saving model...') + # save model to file + gbm.save_model('tmp/model_gender.txt') + print('save model done!') + return gbm +# %% + + +def LGBM_age(epoch, early_stopping_rounds): + params_age = { + 'boosting_type': 'gbdt', + 'objective': 'multiclass', + "num_class": 10, + # fine-tuning最重要的三个参数 + 'num_leaves': 2**10-1, + 'max_depth': -1, # 不限制树深度 + 'min_data_in_leaf': 1, + # 更高的accuracy + # 'max_bin': 2**9-1, + # 'num_iterations': 50, # epoch + 'metric': {'multi_logloss', 'multi_error'}, + 'learning_rate': 0.01, + + 'feature_fraction': 0.9, + 'bagging_fraction': 0.8, + # 'bagging_freq': 5, + 'verbose': 1 + } + print('Start training...') + # train + gbm = lgb.train(params_age, + lgb_train_age, + num_boost_round=epoch, + valid_sets=lgb_eval_age, + early_stopping_rounds=1000, + ) + print('Saving model...') + # save model to file + gbm.save_model('tmp/model_age.txt') + print('save model done!') + return gbm + + +LGBM_age(epoch=10, early_stopping_rounds=1000) +# %% + + +LGBM_gender(epoch=50, early_stopping_rounds=1000) + +# %%