diff --git a/LSTM_age_gender.py b/LSTM_age_gender.py new file mode 100644 index 0000000..21ffa91 --- /dev/null +++ b/LSTM_age_gender.py @@ -0,0 +1,504 @@ +# %% +import os +import tensorflow as tf +import numpy as np +import pandas as pd +from tqdm import tqdm +from tensorflow.keras import losses +from tensorflow.keras import optimizers +from tensorflow.keras import layers +from tensorflow.keras import models +from tensorflow.keras import callbacks +from tensorflow.keras.callbacks import ModelCheckpoint, LearningRateScheduler +from tensorflow.keras.layers import Input, LSTM, Bidirectional, Embedding, Dense, Dropout, Concatenate +from tensorflow.keras.models import Model, Sequential +from tensorflow.keras.preprocessing.sequence import pad_sequences +from tensorflow.keras.preprocessing.text import Tokenizer +from tensorflow.keras.utils import to_categorical +from gensim.models import Word2Vec, KeyedVectors +from mymail import mail +import argparse + +os.environ["CUDA_VISIBLE_DEVICES"] = "0" + +# %% +# 统计creative_id序列的长度,只需要统计一次 +# f = open('word2vec/userid_creative_ids.txt') +# LEN_creative_id = -1 +# for line in f: +# current_line_len = len(line.strip().split(' ')) +# LEN_creative_id = max(LEN_creative_id, current_line_len) +# f.close() +# %% +parser = argparse.ArgumentParser() +parser.add_argument('--load_from_npy', action='store_true', + help='从npy文件加载训练数据,不用每次训练都重新生成array文件', + default=False) +parser.add_argument('--not_train_embedding', action='store_false', + help='不训练embedding文件,一般来说加上这个参数效果不太好', + default=True) + +parser.add_argument('--epoch', type=int, + help='epoch 大小', + default=5) +parser.add_argument('--batch_size', type=int, + help='batch size大小', + default=256) +parser.add_argument('--train_examples', type=int, + help='训练数据,默认为训练集,不包含验证集,调试时候可以设置1000', + default=810000) +parser.add_argument('--val_examples', type=int, + help='验证集数据,调试时候可以设置1000', + default=90000) + + +parser.add_argument('--num_lstm', type=int, + help='LSTM层数个数,目前结果3层比5层好用,1层还在做实验中...', + default=1) + +args = parser.parse_args() +# %% +NUM_creative_id = 2481135+1 +NUM_ad_id = 2264190+1 +NUM_product_id = 33273+1 + + +def get_train_val(): + + # 提取词向量文件 + def get_embedding(feature_name, tokenizer): + path = f"word2vec/wordvectors_{feature_name}.kv" + wv = KeyedVectors.load(path, mmap='r') + feature_tokens = list(wv.vocab.keys()) + embedding_dim = 128 + embedding_matrix = np.random.randn( + len(feature_tokens)+1, embedding_dim) + for feature in feature_tokens: + embedding_vector = wv[feature] + if embedding_vector is not None: + index = tokenizer.texts_to_sequences([feature])[0][0] + embedding_matrix[index] = embedding_vector + return embedding_matrix + + # 从序列文件提取array格式数据 + def get_train(feature_name, vocab_size, len_feature): + f = open(f'word2vec/userid_{feature_name}s.txt') + tokenizer = Tokenizer(num_words=vocab_size) + tokenizer.fit_on_texts(f) + f.close() + + feature_seq = [] + with open(f'word2vec/userid_{feature_name}s.txt') as f: + for text in f: + feature_seq.append(text.strip()) + + sequences = tokenizer.texts_to_sequences(feature_seq[:900000//1]) + X_train = pad_sequences( + sequences, maxlen=len_feature, padding='post') + return X_train, tokenizer + + # 构造输出的训练标签 + # 获得age、gender标签 + DATA = {} + + user_train = pd.read_csv( + 'data/train_preliminary/user.csv').sort_values(['user_id'], ascending=(True,)) + Y_gender = user_train['gender'].values + Y_age = user_train['age'].values + Y_gender = Y_gender - 1 + Y_age = Y_age - 1 + Y_age = to_categorical(Y_age) + Y_gender = to_categorical(Y_gender) + + 
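+    # Labels in user.csv are 1-based (age 1-10, gender 1-2). Shifting them to
+    # 0-based indices before to_categorical gives 10-dim / 2-dim one-hot rows
+    # that line up with the softmax heads built in get_model_head_concat,
+    # e.g. age 3 -> index 2 -> [0, 0, 1, 0, 0, 0, 0, 0, 0, 0].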
+    # 90/10 split between training and validation examples; computed first so
+    # the label and feature slices below can use it.
+    num_examples = Y_age.shape[0]
+    train_examples = int(num_examples * 0.9)
+
+    DATA['Y_gender_train'] = Y_gender[:train_examples]
+    DATA['Y_gender_val'] = Y_gender[train_examples:]
+    DATA['Y_age_train'] = Y_age[:train_examples]
+    DATA['Y_age_val'] = Y_age[train_examples:]
+
+    # First input: creative_id feature
+    X1_train, tokenizer = get_train(
+        'creative_id', NUM_creative_id, LEN_creative_id)
+    creative_id_emb = get_embedding('creative_id', tokenizer)
+
+    DATA['X1_train'] = X1_train[:train_examples]
+    DATA['X1_val'] = X1_train[train_examples:]
+    DATA['creative_id_emb'] = creative_id_emb
+
+    # Second input: ad_id feature
+    X2_train, tokenizer = get_train(
+        'ad_id', NUM_ad_id, LEN_ad_id)
+    ad_id_emb = get_embedding('ad_id', tokenizer)
+
+    DATA['X2_train'] = X2_train[:train_examples]
+    DATA['X2_val'] = X2_train[train_examples:]
+    DATA['ad_id_emb'] = ad_id_emb
+
+    # Third input: product_id feature
+    X3_train, tokenizer = get_train(
+        'product_id', NUM_product_id, LEN_product_id)
+    product_id_emb = get_embedding('product_id', tokenizer)
+
+    DATA['X3_train'] = X3_train[:train_examples]
+    DATA['X3_val'] = X3_train[train_examples:]
+    DATA['product_id_emb'] = product_id_emb
+
+    # Fourth input: advertiser_id feature
+    X4_train, tokenizer = get_train(
+        'advertiser_id', NUM_advertiser_id, LEN_advertiser_id)
+    advertiser_id_emb = get_embedding('advertiser_id', tokenizer)
+
+    DATA['X4_train'] = X4_train[:train_examples]
+    DATA['X4_val'] = X4_train[train_examples:]
+    DATA['advertiser_id_emb'] = advertiser_id_emb
+
+    # Fifth input: industry feature
+    X5_train, tokenizer = get_train(
+        'industry', NUM_industry, LEN_industry)
+    industry_emb = get_embedding('industry', tokenizer)
+
+    DATA['X5_train'] = X5_train[:train_examples]
+    DATA['X5_val'] = X5_train[train_examples:]
+    DATA['industry_emb'] = industry_emb
+
+    # Sixth input: product_category feature
+    X6_train, tokenizer = get_train(
+        'product_category', NUM_product_category, LEN_product_category)
+    product_category_emb = get_embedding('product_category', tokenizer)
+
+    DATA['X6_train'] = X6_train[:train_examples]
+    DATA['X6_val'] = X6_train[train_examples:]
+    DATA['product_category_emb'] = product_category_emb
+
+    return DATA
+
+# %%
+
+
+def get_test():
+    pass
+
+
+# %%
+LEN_creative_id = 100
+LEN_ad_id = 100
+LEN_product_id = 100
+LEN_advertiser_id = 100
+LEN_industry = 100
+LEN_product_category = 100
+
+# NOTE: placeholder vocabulary sizes for the remaining features -- set each to
+# the actual number of distinct ids in the data + 1 before training; they are
+# required by get_train_val and get_model_head_concat.
+NUM_advertiser_id = 62965+1
+NUM_industry = 336+1
+NUM_product_category = 18+1
+
+
+def get_model(DATA):
+    # Input shape: (sequence length, )
+    # first input
+    input_creative_id = Input(shape=(None,), name='creative_id')
+    x1 = Embedding(input_dim=NUM_creative_id,
+                   output_dim=128,
+                   weights=[DATA['creative_id_emb']],
+                   trainable=args.not_train_embedding,
+                   input_length=LEN_creative_id,
+                   mask_zero=True)(input_creative_id)
+    for _ in range(args.num_lstm):
+        x1 = Bidirectional(LSTM(256, return_sequences=True))(x1)
+    x1 = layers.GlobalMaxPooling1D()(x1)
+
+    # second input
+    input_ad_id = Input(shape=(None,), name='ad_id')
+    x2 = Embedding(input_dim=NUM_ad_id,
+                   output_dim=128,
+                   weights=[DATA['ad_id_emb']],
+                   trainable=args.not_train_embedding,
+                   input_length=LEN_ad_id,
+                   mask_zero=True)(input_ad_id)
+    for _ in range(args.num_lstm):
+        x2 = Bidirectional(LSTM(256, return_sequences=True))(x2)
+    x2 = layers.GlobalMaxPooling1D()(x2)
+
+    # third input
+    input_product_id = Input(shape=(None,), name='product_id')
+    x3 = Embedding(input_dim=NUM_product_id,
+                   output_dim=128,
+                   weights=[DATA['product_id_emb']],
+                   trainable=args.not_train_embedding,
+                   input_length=LEN_product_id,
+                   mask_zero=True)(input_product_id)
+    for _ in range(args.num_lstm):
+        x3 = Bidirectional(LSTM(256,
return_sequences=True))(x3) + x3 = layers.GlobalMaxPooling1D()(x3) + + # concat x1 x2 + x = layers.Concatenate(axis=1)([x1, x2, x3]) + # x = Dense(128)(x) + # x = Dropout(0.1)(x) + output_y = Dense(10, activation='softmax')(x) + + model = Model([input_creative_id, input_ad_id, input_product_id], output_y) + model.compile(loss='categorical_crossentropy', + optimizer='adam', metrics=['accuracy']) + model.summary() + + return model + +# %% + + +def get_model_head_concat(DATA): + # shape:(sequence长度, ) + # first input + input_creative_id = Input(shape=(None,), name='creative_id') + x1 = Embedding(input_dim=NUM_creative_id, + output_dim=128, + weights=[DATA['creative_id_emb']], + trainable=args.not_train_embedding, + input_length=LEN_creative_id, + mask_zero=True)(input_creative_id) + + input_ad_id = Input(shape=(None,), name='ad_id') + x2 = Embedding(input_dim=NUM_ad_id, + output_dim=128, + weights=[DATA['ad_id_emb']], + trainable=args.not_train_embedding, + input_length=LEN_ad_id, + mask_zero=True)(input_ad_id) + + input_product_id = Input(shape=(None,), name='product_id') + x3 = Embedding(input_dim=NUM_product_id, + output_dim=128, + weights=[DATA['product_id_emb']], + trainable=args.not_train_embedding, + input_length=LEN_product_id, + mask_zero=True)(input_product_id) + + input_advertiser_id = Input(shape=(None,), name='advertiser_id') + x4 = Embedding(input_dim=NUM_advertiser_id, + output_dim=128, + weights=[DATA['advertiser_id_emb']], + trainable=args.not_train_embedding, + input_length=LEN_advertiser_id, + mask_zero=True)(input_advertiser_id) + + input_industry = Input(shape=(None,), name='industry') + x5 = Embedding(input_dim=NUM_industry, + output_dim=128, + weights=[DATA['industry_emb']], + trainable=args.not_train_embedding, + input_length=LEN_industry, + mask_zero=True)(input_industry) + + input_product_category = Input(shape=(None,), name='product_category') + x6 = Embedding(input_dim=NUM_product_category, + output_dim=128, + weights=[DATA['product_category_emb']], + trainable=args.not_train_embedding, + input_length=LEN_product_category, + mask_zero=True)(input_product_category) + + x = Concatenate(axis=1)([x1, x2, x3, x4, x5, x6]) + + for _ in range(args.num_lstm): + x = Bidirectional(LSTM(256, return_sequences=True))(x) + x = layers.GlobalMaxPooling1D()(x) + # x = layers.GlobalAvaregePooling1D()(x) + + output_gender = Dense(2, activation='softmax', name='gender')(x) + output_age = Dense(10, activation='softmax', name='age')(x) + + model = Model( + [ + input_creative_id, + input_ad_id, + input_product_id, + input_advertiser_id, + input_industry, + input_product_category + ], + [ + output_gender, + output_age + ] + ) + model.compile( + optimizer=optimizers.Adam(1e-4), + loss={'gender': losses.CategoricalCrossentropy(from_logits=False), + 'age': losses.CategoricalCrossentropy(from_logits=False)}, + loss_weights=[0.5, 0.5], + metrics=['accuracy']) + model.summary() + + return model + + +# %% +if not args.load_from_npy: + mail('start getting train data') + print('从csv文件提取训练数据到array格式,大概十分钟时间') + DATA = get_train_val() + mail('get train data done.') + + # 训练数据保存为npy文件 + dirs = 'tmp/' + if not os.path.exists(dirs): + os.makedirs(dirs) + + def save_npy(datas, name): + for i, data in enumerate(datas): + np.save(f'tmp/{name}_{i}.npy', data) + + inputs = [ + DATA['X1_train'], DATA['X1_val'], + DATA['X2_train'], DATA['X2_val'], + DATA['X3_train'], DATA['X3_val'], + DATA['X4_train'], DATA['X4_val'], + DATA['X5_train'], DATA['X5_val'], + DATA['X6_train'], DATA['X6_val'], + ] + outputs_age = 
[DATA['Y_age_train'], DATA['Y_age_val']]
+    outputs_gender = [DATA['Y_gender_train'], DATA['Y_gender_val']]
+    embeddings = [
+        DATA['creative_id_emb'],
+        DATA['ad_id_emb'],
+        DATA['product_id_emb'],
+        DATA['advertiser_id_emb'],
+        DATA['industry_emb'],
+        DATA['product_category_emb'],
+    ]
+    save_npy(inputs, 'inputs')
+    save_npy(outputs_age, 'age')
+    save_npy(outputs_gender, 'gender')
+    save_npy(embeddings, 'embeddings')
+else:
+    DATA = {}
+    DATA['X1_train'] = np.load('tmp/inputs_0.npy', allow_pickle=True)
+    DATA['X1_val'] = np.load('tmp/inputs_1.npy', allow_pickle=True)
+    DATA['X2_train'] = np.load('tmp/inputs_2.npy', allow_pickle=True)
+    DATA['X2_val'] = np.load('tmp/inputs_3.npy', allow_pickle=True)
+    DATA['X3_train'] = np.load('tmp/inputs_4.npy', allow_pickle=True)
+    DATA['X3_val'] = np.load('tmp/inputs_5.npy', allow_pickle=True)
+    DATA['X4_train'] = np.load('tmp/inputs_6.npy', allow_pickle=True)
+    DATA['X4_val'] = np.load('tmp/inputs_7.npy', allow_pickle=True)
+    DATA['X5_train'] = np.load('tmp/inputs_8.npy', allow_pickle=True)
+    DATA['X5_val'] = np.load('tmp/inputs_9.npy', allow_pickle=True)
+    DATA['X6_train'] = np.load('tmp/inputs_10.npy', allow_pickle=True)
+    DATA['X6_val'] = np.load('tmp/inputs_11.npy', allow_pickle=True)
+    DATA['Y_gender_train'] = np.load('tmp/gender_0.npy', allow_pickle=True)
+    DATA['Y_gender_val'] = np.load('tmp/gender_1.npy', allow_pickle=True)
+    DATA['Y_age_train'] = np.load('tmp/age_0.npy', allow_pickle=True)
+    DATA['Y_age_val'] = np.load('tmp/age_1.npy', allow_pickle=True)
+    DATA['creative_id_emb'] = np.load(
+        'tmp/embeddings_0.npy', allow_pickle=True)
+    DATA['ad_id_emb'] = np.load(
+        'tmp/embeddings_1.npy', allow_pickle=True)
+    DATA['product_id_emb'] = np.load(
+        'tmp/embeddings_2.npy', allow_pickle=True)
+    DATA['advertiser_id_emb'] = np.load(
+        'tmp/embeddings_3.npy', allow_pickle=True)
+    DATA['industry_emb'] = np.load(
+        'tmp/embeddings_4.npy', allow_pickle=True)
+    DATA['product_category_emb'] = np.load(
+        'tmp/embeddings_5.npy', allow_pickle=True)
+
+
+# %%
+# model = get_model(DATA)
+model = get_model_head_concat(DATA)
+# %%
+# Quick smoke test: each input has shape (batch_size, sequence length)
+# x1 = np.array([1, 2, 3, 4]).reshape(1, -1)
+# x2 = np.array([1, 2, 3, 4]).reshape(1, -1)
+# model.predict([x1, x2])
+
+
+# %%
+def scheduler(epoch):
+    if epoch < 10:
+        return 0.001
+    else:
+        return 0.001 * tf.math.exp(0.1 * (10 - epoch))
+
+
+lr = LearningRateScheduler(scheduler)
+checkpoint = ModelCheckpoint("tmp/age_epoch_{epoch:02d}.hdf5", monitor='val_loss', verbose=1,
+                             save_best_only=False, mode='auto', period=1)
+# %%
+try:
+    train_examples = args.train_examples
+    val_examples = args.val_examples
+    mail('start train lstm')
+    model.fit(
+        {
+            'creative_id': DATA['X1_train'][:train_examples],
+            'ad_id': DATA['X2_train'][:train_examples],
+            'product_id': DATA['X3_train'][:train_examples],
+            'advertiser_id': DATA['X4_train'][:train_examples],
+            'industry': DATA['X5_train'][:train_examples],
+            'product_category': DATA['X6_train'][:train_examples]
+        },
+        {
+            'gender': DATA['Y_gender_train'][:train_examples],
+            'age': DATA['Y_age_train'][:train_examples],
+        },
+        validation_data=(
+            {
+                'creative_id': DATA['X1_val'][:val_examples],
+                'ad_id': DATA['X2_val'][:val_examples],
+                'product_id': DATA['X3_val'][:val_examples],
+                'advertiser_id': DATA['X4_val'][:val_examples],
+                'industry': DATA['X5_val'][:val_examples],
+                'product_category': DATA['X6_val'][:val_examples]
+            },
+            {
+                'gender': DATA['Y_gender_val'][:val_examples],
+                'age': DATA['Y_age_val'][:val_examples],
+            },
+        ),
+        epochs=args.epoch,
batch_size=args.batch_size, + callbacks=[checkpoint], + ) + mail('train lstm done!!!') +except Exception as e: + e = str(e) + mail('train lstm failed!!! ' + e) + + +# %% +# 后续为预测过程,暂时注释掉不使用但是不要删除 +# model.load_weights('tmp\gender_epoch_01.hdf5') + + +# # %% +# if debug: +# sequences = tokenizer.texts_to_sequences( +# creative_id_seq[900000:]) +# else: +# sequences = tokenizer.texts_to_sequences( +# creative_id_seq[900000:]) + +# X_test = pad_sequences(sequences, maxlen=LEN_creative_id) +# # %% +# y_pred = model.predict(X_test, batch_size=4096) + +# y_pred = np.where(y_pred > 0.5, 1, 0) +# y_pred = y_pred.flatten() + +# # %% +# y_pred = y_pred+1 +# # %% +# res = pd.DataFrame({'predicted_gender': y_pred}) +# res.to_csv( +# 'data/ans/lstm_gender.csv', header=True, columns=['predicted_gender'], index=False) + + +# # %% +# mail('predict lstm gender done') + +# %% diff --git a/LSTM_age_multi_input.py b/LSTM_age_multi_input.py index 841cb18..5be3865 100644 --- a/LSTM_age_multi_input.py +++ b/LSTM_age_multi_input.py @@ -90,7 +90,22 @@ def get_train(feature_name, vocab_size, len_feature): sequences, maxlen=len_feature, padding='post') return X_train, tokenizer + # 构造输出的训练标签 + # 获得age、gender标签 DATA = {} + + user_train = pd.read_csv( + 'data/train_preliminary/user.csv').sort_values(['user_id'], ascending=(True,)) + Y_gender = user_train['gender'].values + Y_age = user_train['age'].values + Y_gender = Y_gender - 1 + Y_age = Y_age - 1 + Y_age = to_categorical(Y_age) + Y_gender = to_categorical(Y_gender) + + DATA['Y_train'] = Y_age[:train_examples] + DATA['Y_val'] = Y_age[train_examples:] + num_examples = Y_age.shape[0] train_examples = int(num_examples * 0.9) @@ -124,20 +139,6 @@ def get_train(feature_name, vocab_size, len_feature): DATA['X3_val'] = X3_train[train_examples:] DATA['product_id_emb'] = product_id_emb - # 构造输出的训练标签 - # 获得age、gender标签 - user_train = pd.read_csv( - 'data/train_preliminary/user.csv').sort_values(['user_id'], ascending=(True,)) - Y_gender = user_train['gender'].values - Y_age = user_train['age'].values - Y_gender = Y_gender - 1 - Y_age = Y_age - 1 - Y_age = to_categorical(Y_age) - Y_gender = to_categorical(Y_gender) - - DATA['Y_train'] = Y_age[:train_examples] - DATA['Y_val'] = Y_age[train_examples:] - # 分别对应 x1_train x1_val x2_train x2_val y_train y_val return DATA diff --git a/Transformer_keras.py b/Transformer_keras.py index a9156b7..1951b2f 100644 --- a/Transformer_keras.py +++ b/Transformer_keras.py @@ -249,7 +249,7 @@ def get_age_model(creative_id_emb, ad_id_emb, product_id_emb): x3 = layers.GlobalMaxPooling1D()(x3) # concat x1 x2 x3 - x = concatenate([x1, x2, x3]) + x = Concatenate(axis=1)([x1, x2, x3]) # x = x1 + x2 + x3 x = Dense(20)(x) # x = Dropout(0.1)(x) @@ -282,7 +282,7 @@ def get_model_head_concat(creative_id_emb, ad_id_emb, product_id_emb): input_product_id = Input(shape=(None,), name='product_id') x3 = TokenAndPositionEmbedding( maxlen, NUM_product_id, embed_dim, product_id_emb)(input_product_id) - + # x = concatenate([x1, x2, x3]) x = layers.Concatenate(axis=1)([x1, x2, x3])
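The commented-out prediction block in LSTM_age_gender.py still targets the earlier single-output gender model. A minimal inference sketch for the new two-head model might look like the following; the six test arrays (X1_test ... X6_test), the checkpoint filename, and the submission path are assumptions, since the test pipeline is not part of this diff.

# Inference sketch for the two-head (gender, age) model -- assumes the test
# arrays were built with the same tokenizers and pad_sequences settings as the
# training arrays, and that a checkpoint saved by the callback above exists.
import numpy as np
import pandas as pd

model = get_model_head_concat(DATA)
model.load_weights('tmp/age_epoch_05.hdf5')  # assumed checkpoint name

# model.predict returns one array per output, in the order [gender, age]
pred_gender, pred_age = model.predict(
    {
        'creative_id': X1_test,
        'ad_id': X2_test,
        'product_id': X3_test,
        'advertiser_id': X4_test,
        'industry': X5_test,
        'product_category': X6_test,
    },
    batch_size=4096)

# argmax over each softmax head, then shift back to the original 1-based labels
predicted_age = np.argmax(pred_age, axis=1) + 1
predicted_gender = np.argmax(pred_gender, axis=1) + 1

res = pd.DataFrame({'predicted_age': predicted_age,
                    'predicted_gender': predicted_gender})
res.to_csv('data/ans/lstm_age_gender.csv', index=False)  # assumed output path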