From 01d8b2f18bdac88025ed8efd3e39becb0c2e71b4 Mon Sep 17 00:00:00 2001
From: sunlanchang
Date: Mon, 15 Jun 2020 11:57:45 +0800
Subject: [PATCH] update LSTM

---
 LSTM_age_multi_input.py        |  90 ++++-------
 LSTM_gender_multi_input.py     | 203 +++++++++++++-----------
 LSTM_gender_multi_input_old.py | 279 +++++++++++++++++++++++++++++++++
 test.py                        |  98 ++++++++++++
 4 files changed, 519 insertions(+), 151 deletions(-)
 create mode 100644 LSTM_gender_multi_input_old.py
 create mode 100644 test.py

diff --git a/LSTM_age_multi_input.py b/LSTM_age_multi_input.py
index 4359ec3..8721d2a 100644
--- a/LSTM_age_multi_input.py
+++ b/LSTM_age_multi_input.py
@@ -18,6 +18,7 @@
 os.environ["CUDA_VISIBLE_DEVICES"] = "0"
 
 # %%
+# Compute the length of the creative_id sequences; this only needs to be done once
 # f = open('word2vec/userid_creative_ids.txt')
 # LEN_creative_id = -1
 # for line in f:
@@ -27,10 +28,10 @@
 # %%
 parser = argparse.ArgumentParser()
 parser.add_argument('--load_from_npy', action='store_true',
-                    help='load data from npy files',
+                    help='load training data from npy files so the arrays do not have to be regenerated for every run',
                     default=False)
 parser.add_argument('--not_train_embedding', action='store_false',
-                    help='load data from npy files',
+                    help='do not train the embeddings; in general adding this flag does not help',
                     default=True)
 
 parser.add_argument('--epoch', type=int,
@@ -40,12 +41,12 @@
                     help='batch size',
                     default=256)
 parser.add_argument('--examples', type=int,
-                    help='number of training examples; defaults to the training set, excluding the validation set',
+                    help='number of training examples; defaults to the training set excluding the validation set; set to 1000 when debugging',
                     default=810000)
 
 
 parser.add_argument('--num_lstm', type=int,
-                    help='number of LSTM heads',
+                    help='number of stacked LSTM layers; so far 3 layers works better than 5, 1 layer is still being tested...',
                     default=1)
 
 args = parser.parse_args()
@@ -57,6 +58,22 @@
 
 
 def get_train_val():
+    # Load the pretrained word-vector file
+    def get_embedding(feature_name):
+        path = f"word2vec/wordvectors_{feature_name}.kv"
+        wv = KeyedVectors.load(path, mmap='r')
+        feature_tokens = list(wv.vocab.keys())
+        embedding_dim = 128
+        embedding_matrix = np.random.randn(
+            len(feature_tokens)+1, embedding_dim)
+        for feature in feature_tokens:
+            embedding_vector = wv[feature]
+            if embedding_vector is not None:
+                index = tokenizer.texts_to_sequences([feature])[0][0]
+                embedding_matrix[index] = embedding_vector
+        return embedding_matrix
+
+    # First input
     # Build the creative_id feature
     # f = open('tmp/userid_creative_ids.txt')
     f = open('word2vec/userid_creative_ids.txt')
@@ -72,23 +89,9 @@ def get_train_val():
     X1_train = pad_sequences(
         sequences, maxlen=LEN_creative_id, padding='post')
 
-    # Build the creative_id embedding
-    def get_creative_id_emb():
-        path = "word2vec/wordvectors_creative_id.kv"
-        wv = KeyedVectors.load(path, mmap='r')
-        creative_id_tokens = list(wv.vocab.keys())
-        embedding_dim = 128
-        embedding_matrix = np.random.randn(
-            len(creative_id_tokens)+1, embedding_dim)
-        for creative_id in creative_id_tokens:
-            embedding_vector = wv[creative_id]
-            if embedding_vector is not None:
-                index = tokenizer.texts_to_sequences([creative_id])[0][0]
-                embedding_matrix[index] = embedding_vector
-        return embedding_matrix
-
-    creative_id_emb = get_creative_id_emb()
+    creative_id_emb = get_embedding(feature_name='creative_id')
 
+    # Second input
     # Build the ad_id feature
     f = open('word2vec/userid_ad_ids.txt')
     tokenizer = Tokenizer(num_words=NUM_ad_id)
@@ -103,22 +106,9 @@ def get_creative_id_emb():
     X2_train = pad_sequences(
         sequences, maxlen=LEN_ad_id, padding='post')
 
-    def get_ad_id_emb():
-        path = "word2vec/wordvectors_ad_id.kv"
-        wv = KeyedVectors.load(path, mmap='r')
-        ad_id_tokens = list(wv.vocab.keys())
-        embedding_dim = 128
-        embedding_matrix = np.random.randn(
-            len(ad_id_tokens)+1, embedding_dim)
-        for ad_id in ad_id_tokens:
-            embedding_vector = wv[ad_id]
-            if embedding_vector is not None:
-                index = tokenizer.texts_to_sequences([ad_id])[0][0]
-                embedding_matrix[index] = embedding_vector
-        return embedding_matrix
-
-    ad_id_emb = get_ad_id_emb()
+    ad_id_emb = get_embedding(feature_name='ad_id')
 
+    # Third input
     # Build the product_id feature
     # f = open('tmp/userid_product_ids.txt')
     f = open('word2vec/userid_product_ids.txt')
@@ -134,24 +124,10 @@ def get_ad_id_emb():
     X3_train = pad_sequences(
         sequences, maxlen=LEN_product_id, padding='post')
 
-    # Build the product_id embedding
-    def get_product_id_emb():
-        path = "word2vec/wordvectors_product_id.kv"
-        wv = KeyedVectors.load(path, mmap='r')
-        product_id_tokens = list(wv.vocab.keys())
-        embedding_dim = 128
-        embedding_matrix = np.random.randn(
-            len(product_id_tokens)+1, embedding_dim)
-        for product_id in product_id_tokens:
-            embedding_vector = wv[product_id]
-            if embedding_vector is not None:
-                index = tokenizer.texts_to_sequences([product_id])[0][0]
-                embedding_matrix[index] = embedding_vector
-        return embedding_matrix
-
-    product_id_emb = get_product_id_emb()
+    product_id_emb = get_embedding(feature_name='product_id')
 
-    # Build the age labels
+    # Build the training labels for the outputs
+    # Build the age and gender labels
     user_train = pd.read_csv(
         'data/train_preliminary/user.csv').sort_values(['user_id'], ascending=(True,))
     Y_gender = user_train['gender'].values
@@ -268,15 +244,6 @@ def save_data(datas):
 # %%
 checkpoint = ModelCheckpoint("tmp/age_epoch_{epoch:02d}.hdf5", monitor='val_loss', verbose=1,
                              save_best_only=False, mode='auto', period=1)
-# %%
-# model.fit(
-#     {'creative_id': x1_train, 'ad_id': x2_train},
-#     y_train,
-#     validation_data=([x1_val, x2_val], y_val),
-#     epochs=5,
-#     batch_size=256,
-#     callbacks=[checkpoint],
-# )
 
 # %%
 try:
@@ -298,6 +265,7 @@ def save_data(datas):
 
 
 # %%
+# What follows is the prediction step; commented out for now, unused, but do not delete
 # model.load_weights('tmp\gender_epoch_01.hdf5')
 
 
diff --git a/LSTM_gender_multi_input.py b/LSTM_gender_multi_input.py
index f465851..945f20a 100644
--- a/LSTM_gender_multi_input.py
+++ b/LSTM_gender_multi_input.py
@@ -1,28 +1,55 @@
 # %%
 # Generate the word-embedding files
-from tqdm import tqdm
+import os
+import tensorflow as tf
 import numpy as np
 import pandas as pd
+from tqdm import tqdm
 from tensorflow.keras.callbacks import ModelCheckpoint, LearningRateScheduler
-from gensim.models import Word2Vec, KeyedVectors
-from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Dropout, concatenate
+from tensorflow.keras import layers
+from tensorflow.keras.layers import Input, LSTM, Bidirectional, Embedding, Dense, Dropout, concatenate
 from tensorflow.keras.models import Model, Sequential
-import tensorflow as tf
 from tensorflow.keras.preprocessing.sequence import pad_sequences
 from tensorflow.keras.preprocessing.text import Tokenizer
+from tensorflow.keras.utils import to_categorical
+from gensim.models import Word2Vec, KeyedVectors
 from mymail import mail
-import os
+import argparse
 os.environ["CUDA_VISIBLE_DEVICES"] = "0"
 
 # %%
+# Compute the length of the creative_id sequences; this only needs to be done once
 # f = open('word2vec/userid_creative_ids.txt')
 # LEN_creative_id = -1
 # for line in f:
 #     current_line_len = len(line.strip().split(' '))
 #     LEN_creative_id = max(LEN_creative_id, current_line_len)
 # f.close()
-
-
+# %%
+parser = argparse.ArgumentParser()
+parser.add_argument('--load_from_npy', action='store_true',
+                    help='load training data from npy files so the arrays do not have to be regenerated for every run',
+                    default=False)
+parser.add_argument('--not_train_embedding', action='store_false',
+                    help='do not train the embeddings; in general adding this flag does not help',
+                    default=True)
+
+parser.add_argument('--epoch', type=int,
+                    help='number of epochs',
+                    default=5)
+parser.add_argument('--batch_size', type=int,
+                    help='batch size',
+                    default=256)
+parser.add_argument('--examples', type=int,
+                    help='number of training examples; defaults to the training set excluding the validation set; set to 1000 when debugging',
+                    default=810000)
+
+
+parser.add_argument('--num_lstm', type=int,
+                    help='number of stacked LSTM layers; so far 3 layers works better than 5, 1 layer is still being tested...',
+                    default=1)
+
+args = parser.parse_args()
 # %%
 NUM_creative_id = 2481135+1
 NUM_ad_id = 2264190+1
@@ -31,6 +58,22 @@
 
 
 def get_train_val():
+    # Load the pretrained word-vector file
+    def get_embedding(feature_name):
+        path = f"word2vec/wordvectors_{feature_name}.kv"
+        wv = KeyedVectors.load(path, mmap='r')
+        feature_tokens = list(wv.vocab.keys())
+        embedding_dim = 128
+        embedding_matrix = np.random.randn(
+            len(feature_tokens)+1, embedding_dim)
+        for feature in feature_tokens:
+            embedding_vector = wv[feature]
+            if embedding_vector is not None:
+                index = tokenizer.texts_to_sequences([feature])[0][0]
+                embedding_matrix[index] = embedding_vector
+        return embedding_matrix
+
+    # First input
     # Build the creative_id feature
     # f = open('tmp/userid_creative_ids.txt')
     f = open('word2vec/userid_creative_ids.txt')
@@ -38,7 +81,7 @@ def get_train_val():
     tokenizer.fit_on_texts(f)
     f.close()
     creative_id_seq = []
-    with open('word2vec/userid_creative_ids.txt', 'r') as f:
+    with open('word2vec/userid_creative_ids.txt') as f:
         for text in f:
             creative_id_seq.append(text.strip())
 
@@ -46,23 +89,9 @@ def get_train_val():
     X1_train = pad_sequences(
         sequences, maxlen=LEN_creative_id, padding='post')
 
-    # Build the creative_id embedding
-    def get_creative_id_emb():
-        path = "word2vec/wordvectors_creative_id.kv"
-        wv = KeyedVectors.load(path, mmap='r')
-        creative_id_tokens = list(wv.vocab.keys())
-        embedding_dim = 128
-        embedding_matrix = np.random.randn(
-            len(creative_id_tokens)+1, embedding_dim)
-        for creative_id in creative_id_tokens:
-            embedding_vector = wv[creative_id]
-            if embedding_vector is not None:
-                index = tokenizer.texts_to_sequences([creative_id])[0][0]
-                embedding_matrix[index] = embedding_vector
-        return embedding_matrix
-
-    creative_id_emb = get_creative_id_emb()
+    creative_id_emb = get_embedding(feature_name='creative_id')
 
+    # Second input
     # Build the ad_id feature
     f = open('word2vec/userid_ad_ids.txt')
     tokenizer = Tokenizer(num_words=NUM_ad_id)
@@ -77,22 +106,9 @@ def get_creative_id_emb():
     X2_train = pad_sequences(
         sequences, maxlen=LEN_ad_id, padding='post')
 
-    def get_ad_id_emb():
-        path = "word2vec/wordvectors_ad_id.kv"
-        wv = KeyedVectors.load(path, mmap='r')
-        ad_id_tokens = list(wv.vocab.keys())
-        embedding_dim = 128
-        embedding_matrix = np.random.randn(
-            len(ad_id_tokens)+1, embedding_dim)
-        for ad_id in ad_id_tokens:
-            embedding_vector = wv[ad_id]
-            if embedding_vector is not None:
-                index = tokenizer.texts_to_sequences([ad_id])[0][0]
-                embedding_matrix[index] = embedding_vector
-        return embedding_matrix
-
-    ad_id_emb = get_ad_id_emb()
+    ad_id_emb = get_embedding(feature_name='ad_id')
 
+    # Third input
     # Build the product_id feature
     # f = open('tmp/userid_product_ids.txt')
     f = open('word2vec/userid_product_ids.txt')
@@ -108,30 +124,19 @@ def get_ad_id_emb():
     X3_train = pad_sequences(
         sequences, maxlen=LEN_product_id, padding='post')
 
-    # Build the product_id embedding
-    def get_product_id_emb():
-        path = "word2vec/wordvectors_product_id.kv"
-        wv = KeyedVectors.load(path, mmap='r')
-        product_id_tokens = list(wv.vocab.keys())
-        embedding_dim = 128
-        embedding_matrix = np.random.randn(
-            len(product_id_tokens)+1, embedding_dim)
-        for product_id in product_id_tokens:
-            embedding_vector = wv[product_id]
-            if embedding_vector is not None:
-                index = tokenizer.texts_to_sequences([product_id])[0][0]
-                embedding_matrix[index] = embedding_vector
-        return embedding_matrix
-
-    product_id_emb = get_product_id_emb()
+    product_id_emb = get_embedding(feature_name='product_id')
 
-    # Build the gender labels
+    # Build the training labels for the outputs
+    # Build the age and gender labels
     user_train = pd.read_csv(
         'data/train_preliminary/user.csv').sort_values(['user_id'], ascending=(True,))
     Y_gender = user_train['gender'].values
     Y_age = user_train['age'].values
     Y_gender = Y_gender - 1
-    num_examples = Y_gender.shape[0]
+    Y_age = Y_age - 1
+    Y_age = to_categorical(Y_age)
+    Y_gender = to_categorical(Y_gender)
+    num_examples = Y_age.shape[0]
     train_examples = int(num_examples * 0.9)
 
     # Corresponds to x1_train, x1_val, x2_train, x2_val, y_train, y_val
@@ -157,45 +162,45 @@ def get_gender_model(creative_id_emb, ad_id_emb, product_id_emb):
     x1 = Embedding(input_dim=NUM_creative_id,
                    output_dim=128,
                    weights=[creative_id_emb],
-                   trainable=True,
+                   trainable=args.not_train_embedding,
                    input_length=LEN_creative_id,
                    mask_zero=True)(input_creative_id)
-    x1 = LSTM(1024, return_sequences=True)(x1)
-    x1 = LSTM(512, return_sequences=True)(x1)
-    x1 = LSTM(256, return_sequences=False)(x1)
+    for _ in range(args.num_lstm):
+        x1 = Bidirectional(LSTM(256, return_sequences=True))(x1)
+    x1 = layers.GlobalMaxPooling1D()(x1)
 
     # second input
     input_ad_id = Input(shape=(None,), name='ad_id')
     x2 = Embedding(input_dim=NUM_ad_id,
                    output_dim=128,
                    weights=[ad_id_emb],
-                   trainable=True,
+                   trainable=args.not_train_embedding,
                    input_length=LEN_ad_id,
                    mask_zero=True)(input_ad_id)
-    x2 = LSTM(1024, return_sequences=True)(x2)
-    x2 = LSTM(512, return_sequences=True)(x2)
-    x2 = LSTM(256, return_sequences=False)(x2)
+    for _ in range(args.num_lstm):
+        x2 = Bidirectional(LSTM(256, return_sequences=True))(x2)
+    x2 = layers.GlobalMaxPooling1D()(x2)
 
     # third input
     input_product_id = Input(shape=(None,), name='product_id')
    x3 = Embedding(input_dim=NUM_product_id,
                    output_dim=128,
                    weights=[product_id_emb],
-                   trainable=True,
+                   trainable=args.not_train_embedding,
                    input_length=LEN_product_id,
                    mask_zero=True)(input_product_id)
-    x3 = LSTM(1024, return_sequences=True)(x3)
-    x3 = LSTM(512, return_sequences=True)(x3)
-    x3 = LSTM(256, return_sequences=False)(x3)
+    for _ in range(args.num_lstm):
+        x3 = Bidirectional(LSTM(256, return_sequences=True))(x3)
+    x3 = layers.GlobalMaxPooling1D()(x3)
 
     # concat x1 x2
     x = concatenate([x1, x2, x3])
-    x = Dense(128)(x)
-    x = Dropout(0.1)(x)
-    output_y = Dense(1, activation='sigmoid')(x)
+    # x = Dense(128)(x)
+    # x = Dropout(0.1)(x)
+    output_y = Dense(2, activation='softmax')(x)
 
     model = Model([input_creative_id, input_ad_id, input_product_id], output_y)
-    model.compile(loss='binary_crossentropy',
+    model.compile(loss='categorical_crossentropy',
                   optimizer='adam', metrics=['accuracy'])
     model.summary()
 
@@ -203,10 +208,34 @@ def get_gender_model(creative_id_emb, ad_id_emb, product_id_emb):
 
 
 # %%
-mail('start getting train data')
-x1_train, x1_val, x2_train, x2_val, x3_train, x3_val, y_train, y_val, creative_id_emb, ad_id_emb, product_id_emb = get_train_val()
-mail('get train data done.')
+if not args.load_from_npy:
+    mail('start getting train data')
+    x1_train, x1_val, x2_train, x2_val, x3_train, x3_val, y_train, y_val, creative_id_emb, ad_id_emb, product_id_emb = get_train_val()
+    mail('get train data done.')
+
+    def save_data(datas):
+        dirs = 'tmp/'
+        if not os.path.exists(dirs):
+            os.makedirs(dirs)
+        for i, data in enumerate(datas):
+            np.save(f'tmp/transformer_input_{i}.npy', data)
+    datas = [x1_train, x1_val, x2_train, x2_val, x3_train, x3_val,
+             y_train, y_val, creative_id_emb, ad_id_emb, product_id_emb]
+    save_data(datas)
+else:
+    x1_train = np.load('tmp/transformer_input_0.npy', allow_pickle=True)
+    x1_val = np.load('tmp/transformer_input_1.npy', allow_pickle=True)
+    x2_train = np.load('tmp/transformer_input_2.npy', allow_pickle=True)
+    x2_val = np.load('tmp/transformer_input_3.npy', allow_pickle=True)
+    x3_train = np.load('tmp/transformer_input_4.npy', allow_pickle=True)
+    x3_val = np.load('tmp/transformer_input_5.npy', allow_pickle=True)
+    y_train = np.load('tmp/transformer_input_6.npy', allow_pickle=True)
+    y_val = np.load('tmp/transformer_input_7.npy', allow_pickle=True)
+    creative_id_emb = np.load('tmp/transformer_input_8.npy', allow_pickle=True)
+    ad_id_emb = np.load('tmp/transformer_input_9.npy', allow_pickle=True)
+    product_id_emb = np.load('tmp/transformer_input_10.npy', allow_pickle=True)
 
+# %%
 model = get_gender_model(creative_id_emb, ad_id_emb, product_id_emb)
 # %%
 # %%
@@ -219,34 +248,28 @@ def get_gender_model(creative_id_emb, ad_id_emb, product_id_emb):
 # %%
 checkpoint = ModelCheckpoint("tmp/gender_epoch_{epoch:02d}.hdf5", monitor='val_loss', verbose=1,
                              save_best_only=False, mode='auto', period=1)
-# %%
-# model.fit(
-#     {'creative_id': x1_train, 'ad_id': x2_train},
-#     y_train,
-#     validation_data=([x1_val, x2_val], y_val),
-#     epochs=5,
-#     batch_size=256,
-#     callbacks=[checkpoint],
-# )
 
 # %%
 try:
+    examples = args.examples
     mail('start train lstm')
     model.fit(
-        {'creative_id': x1_train, 'ad_id': x2_train, 'product_id': x3_train},
-        y_train,
+        {'creative_id': x1_train[:examples], 'ad_id': x2_train[:examples],
+            'product_id': x3_train[:examples]},
+        y_train[:examples],
        validation_data=([x1_val, x2_val, x3_val], y_val),
-        epochs=3,
-        batch_size=256,
+        epochs=args.epoch,
+        batch_size=args.batch_size,
         callbacks=[checkpoint],
     )
-    mail('train gender lstm done!!!')
+    mail('train lstm done!!!')
 except Exception as e:
     e = str(e)
     mail('train lstm failed!!! ' + e)
 
 
 # %%
+# What follows is the prediction step; commented out for now, unused, but do not delete
 # model.load_weights('tmp\gender_epoch_01.hdf5')
 
 
diff --git a/LSTM_gender_multi_input_old.py b/LSTM_gender_multi_input_old.py
new file mode 100644
index 0000000..f465851
--- /dev/null
+++ b/LSTM_gender_multi_input_old.py
@@ -0,0 +1,279 @@
+# %%
+# Generate the word-embedding files
+from tqdm import tqdm
+import numpy as np
+import pandas as pd
+from tensorflow.keras.callbacks import ModelCheckpoint, LearningRateScheduler
+from gensim.models import Word2Vec, KeyedVectors
+from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Dropout, concatenate
+from tensorflow.keras.models import Model, Sequential
+import tensorflow as tf
+from tensorflow.keras.preprocessing.sequence import pad_sequences
+from tensorflow.keras.preprocessing.text import Tokenizer
+from mymail import mail
+import os
+os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+
+# %%
+# f = open('word2vec/userid_creative_ids.txt')
+# LEN_creative_id = -1
+# for line in f:
+#     current_line_len = len(line.strip().split(' '))
+#     LEN_creative_id = max(LEN_creative_id, current_line_len)
+# f.close()
+
+
+# %%
+NUM_creative_id = 2481135+1
+NUM_ad_id = 2264190+1
+NUM_product_id = 33273+1
+
+
+def get_train_val():
+
+    # Build the creative_id feature
+    # f = open('tmp/userid_creative_ids.txt')
+    f = open('word2vec/userid_creative_ids.txt')
+    tokenizer = Tokenizer(num_words=NUM_creative_id)
+    tokenizer.fit_on_texts(f)
+    f.close()
+    creative_id_seq = []
+    with open('word2vec/userid_creative_ids.txt', 'r') as f:
+        for text in f:
+            creative_id_seq.append(text.strip())
+
+    sequences = tokenizer.texts_to_sequences(creative_id_seq[:900000//1])
+    X1_train = pad_sequences(
+        sequences, maxlen=LEN_creative_id, padding='post')
+
+    # Build the creative_id embedding
+    def get_creative_id_emb():
+        path = "word2vec/wordvectors_creative_id.kv"
+        wv = KeyedVectors.load(path, mmap='r')
+        creative_id_tokens = list(wv.vocab.keys())
+        embedding_dim = 128
+        embedding_matrix = np.random.randn(
+            len(creative_id_tokens)+1, embedding_dim)
+        for creative_id in creative_id_tokens:
+            embedding_vector = wv[creative_id]
+            if embedding_vector is not None:
+                index = tokenizer.texts_to_sequences([creative_id])[0][0]
+                embedding_matrix[index] = embedding_vector
+        return embedding_matrix
+
+    creative_id_emb = get_creative_id_emb()
+
+    # Build the ad_id feature
+    f = open('word2vec/userid_ad_ids.txt')
+    tokenizer = Tokenizer(num_words=NUM_ad_id)
+    tokenizer.fit_on_texts(f)
+    f.close()
+    ad_id_seq = []
+    with open('word2vec/userid_ad_ids.txt') as f:
+        for text in f:
+            ad_id_seq.append(text.strip())
+
+    sequences = tokenizer.texts_to_sequences(ad_id_seq[:900000//1])
+    X2_train = pad_sequences(
+        sequences, maxlen=LEN_ad_id, padding='post')
+
+    def get_ad_id_emb():
+        path = "word2vec/wordvectors_ad_id.kv"
+        wv = KeyedVectors.load(path, mmap='r')
+        ad_id_tokens = list(wv.vocab.keys())
+        embedding_dim = 128
+        embedding_matrix = np.random.randn(
+            len(ad_id_tokens)+1, embedding_dim)
+        for ad_id in ad_id_tokens:
+            embedding_vector = wv[ad_id]
+            if embedding_vector is not None:
+                index = tokenizer.texts_to_sequences([ad_id])[0][0]
+                embedding_matrix[index] = embedding_vector
+        return embedding_matrix
+
+    ad_id_emb = get_ad_id_emb()
+
+    # Build the product_id feature
+    # f = open('tmp/userid_product_ids.txt')
+    f = open('word2vec/userid_product_ids.txt')
+    tokenizer = Tokenizer(num_words=NUM_product_id)
+    tokenizer.fit_on_texts(f)
+    f.close()
+    product_id_seq = []
+    with open('word2vec/userid_product_ids.txt') as f:
+        for text in f:
+            product_id_seq.append(text.strip())
+
+    sequences = tokenizer.texts_to_sequences(product_id_seq[:900000//1])
+    X3_train = pad_sequences(
+        sequences, maxlen=LEN_product_id, padding='post')
+
+    # Build the product_id embedding
+    def get_product_id_emb():
+        path = "word2vec/wordvectors_product_id.kv"
+        wv = KeyedVectors.load(path, mmap='r')
+        product_id_tokens = list(wv.vocab.keys())
+        embedding_dim = 128
+        embedding_matrix = np.random.randn(
+            len(product_id_tokens)+1, embedding_dim)
+        for product_id in product_id_tokens:
+            embedding_vector = wv[product_id]
+            if embedding_vector is not None:
+                index = tokenizer.texts_to_sequences([product_id])[0][0]
+                embedding_matrix[index] = embedding_vector
+        return embedding_matrix
+
+    product_id_emb = get_product_id_emb()
+
+    # Build the gender labels
+    user_train = pd.read_csv(
+        'data/train_preliminary/user.csv').sort_values(['user_id'], ascending=(True,))
+    Y_gender = user_train['gender'].values
+    Y_age = user_train['age'].values
+    Y_gender = Y_gender - 1
+    num_examples = Y_gender.shape[0]
+    train_examples = int(num_examples * 0.9)
+
+    # Corresponds to x1_train, x1_val, x2_train, x2_val, y_train, y_val
+    return X1_train[:train_examples], X1_train[train_examples:], X2_train[:train_examples], X2_train[train_examples:], X3_train[:train_examples], X3_train[train_examples:], Y_gender[:train_examples], Y_gender[train_examples:], creative_id_emb, ad_id_emb, product_id_emb
+
+# %%
+
+
+def get_test():
+    pass
+
+
+# %%
+LEN_creative_id = 100
+LEN_ad_id = 100
+LEN_product_id = 100
+
+
+def get_gender_model(creative_id_emb, ad_id_emb, product_id_emb):
+    # shape: (sequence length, )
+    # first input
+    input_creative_id = Input(shape=(None,), name='creative_id')
+    x1 = Embedding(input_dim=NUM_creative_id,
+                   output_dim=128,
+                   weights=[creative_id_emb],
+                   trainable=True,
+                   input_length=LEN_creative_id,
+                   mask_zero=True)(input_creative_id)
+    x1 = LSTM(1024, return_sequences=True)(x1)
+    x1 = LSTM(512, return_sequences=True)(x1)
+    x1 = LSTM(256, return_sequences=False)(x1)
+
+    # second input
+    input_ad_id = Input(shape=(None,), name='ad_id')
+    x2 = Embedding(input_dim=NUM_ad_id,
+                   output_dim=128,
+                   weights=[ad_id_emb],
+                   trainable=True,
+                   input_length=LEN_ad_id,
+                   mask_zero=True)(input_ad_id)
+    x2 = LSTM(1024, return_sequences=True)(x2)
+    x2 = LSTM(512, return_sequences=True)(x2)
+    x2 = LSTM(256, return_sequences=False)(x2)
+
+    # third input
+    input_product_id = Input(shape=(None,), name='product_id')
+    x3 = Embedding(input_dim=NUM_product_id,
+                   output_dim=128,
+                   weights=[product_id_emb],
+                   trainable=True,
+                   input_length=LEN_product_id,
+                   mask_zero=True)(input_product_id)
+    x3 = LSTM(1024, return_sequences=True)(x3)
+    x3 = LSTM(512, return_sequences=True)(x3)
+    x3 = LSTM(256, return_sequences=False)(x3)
+
+    # concat x1 x2
+    x = concatenate([x1, x2, x3])
+    x = Dense(128)(x)
+    x = Dropout(0.1)(x)
+    output_y = Dense(1, activation='sigmoid')(x)
+
+    model = Model([input_creative_id, input_ad_id, input_product_id], output_y)
+    model.compile(loss='binary_crossentropy',
+                  optimizer='adam', metrics=['accuracy'])
+    model.summary()
+
+    return model
+
+
+# %%
+mail('start getting train data')
+x1_train, x1_val, x2_train, x2_val, x3_train, x3_val, y_train, y_val, creative_id_emb, ad_id_emb, product_id_emb = get_train_val()
+mail('get train data done.')
+
+model = get_gender_model(creative_id_emb, ad_id_emb, product_id_emb)
+# %%
+# %%
+# Test the data format: (batch_size, sequence length)
+# x1 = np.array([1, 2, 3, 4]).reshape(1, -1)
+# x2 = np.array([1, 2, 3, 4]).reshape(1, -1)
+# model.predict([x1, x2])
+
+
+# %%
+checkpoint = ModelCheckpoint("tmp/gender_epoch_{epoch:02d}.hdf5", monitor='val_loss', verbose=1,
+                             save_best_only=False, mode='auto', period=1)
+# %%
+# model.fit(
+#     {'creative_id': x1_train, 'ad_id': x2_train},
+#     y_train,
+#     validation_data=([x1_val, x2_val], y_val),
+#     epochs=5,
+#     batch_size=256,
+#     callbacks=[checkpoint],
+# )
+
+# %%
+try:
+    mail('start train lstm')
+    model.fit(
+        {'creative_id': x1_train, 'ad_id': x2_train, 'product_id': x3_train},
+        y_train,
+        validation_data=([x1_val, x2_val, x3_val], y_val),
+        epochs=3,
+        batch_size=256,
+        callbacks=[checkpoint],
+    )
+    mail('train gender lstm done!!!')
+except Exception as e:
+    e = str(e)
+    mail('train lstm failed!!! ' + e)
+
+
+# %%
+# model.load_weights('tmp\gender_epoch_01.hdf5')
+
+
+# # %%
+# if debug:
+#     sequences = tokenizer.texts_to_sequences(
+#         creative_id_seq[900000:])
+# else:
+#     sequences = tokenizer.texts_to_sequences(
+#         creative_id_seq[900000:])
+
+# X_test = pad_sequences(sequences, maxlen=LEN_creative_id)
+# # %%
+# y_pred = model.predict(X_test, batch_size=4096)
+
+# y_pred = np.where(y_pred > 0.5, 1, 0)
+# y_pred = y_pred.flatten()
+
+# # %%
+# y_pred = y_pred+1
+# # %%
+# res = pd.DataFrame({'predicted_gender': y_pred})
+# res.to_csv(
+#     'data/ans/lstm_gender.csv', header=True, columns=['predicted_gender'], index=False)
+
+
+# # %%
+# mail('predict lstm gender done')
+
+# %%
diff --git a/test.py b/test.py
new file mode 100644
index 0000000..bca50c0
--- /dev/null
+++ b/test.py
@@ -0,0 +1,98 @@
+# %%
+import random
+import unittest
+
+from transformers import is_torch_available
+
+import transformers
+from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
+import torch
+
+import numpy as np
+import pandas as pd
+import seaborn as sns
+from pylab import rcParams
+import matplotlib.pyplot as plt
+from matplotlib import rc
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import confusion_matrix, classification_report
+from collections import defaultdict
+from textwrap import wrap
+
+from torch import nn, optim
+from torch.utils.data import Dataset, DataLoader
+import torch.nn.functional as F
+
+import pandas as pd
+
+if is_torch_available():
+    from transformers import (
+        BertConfig,
+        BertModel,
+        BertForMaskedLM,
+        BertForNextSentencePrediction,
+        BertForPreTraining,
+        BertForQuestionAnswering,
+        BertForSequenceClassification,
+        BertForTokenClassification,
+        BertForMultipleChoice,
+    )
+    from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_LIST
+
+
+# %%
+PRE_TRAINED_MODEL_NAME = 'bert-base-cased'
+EPOCHS = 10
+
+
+# %%
+creative_id_seq = []
+cnt = 0
+with open('word2vec/userid_creative_ids.txt', 'r') as f:
+    for text in f:
+        creative_id_seq.append(text.strip())
+        cnt += 1
+        if cnt == 90:
+            break
+with open('tmp/tmp.txt', 'w')as f:
+    f.write('[PAD]\n[UNK]\n[CLS]\n[SEP]\n')
+    s = set()
+    for seq in creative_id_seq:
+        seq = seq.split(' ')
+        s = s | set(seq)
+    for e in s:
+        f.write(str(e)+'\n')
+
+
+# %%
+user_train = pd.read_csv(
+    'data/train_preliminary/user.csv').sort_values(['user_id'], ascending=(True,))
+Y_gender = user_train['gender'].values
+Y_age = user_train['age'].values
+Y_gender = Y_gender - 1
+Y_age = Y_age - 1
+# Y_age = to_categorical(Y_age)
+
+
+# %%
+tokenizer = BertTokenizer('tmp/tmp.txt')
+print(tokenizer.get_vocab())
+sample_txt = '456 1 23 456 89 89'
+# tokenizer.tokenize(sample_txt)
+
+
+# %%
+
+encoding = tokenizer.encode_plus(
+    sample_txt,
+    max_length=32,
+    add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
+    return_token_type_ids=False,
+    pad_to_max_length=True,
+    return_attention_mask=True,
+    return_tensors='pt',  # Return PyTorch tensors
+)
+# encoding.keys()
+# encoding['input_ids']
+# encoding['attention_mask']
+# tokenizer.convert_ids_to_tokens(encoding['input_ids'][0])
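
Reviewer note, not part of the patch: the core change above replaces the three stacked unidirectional LSTMs per input with a configurable number (--num_lstm) of Bidirectional LSTM layers followed by GlobalMaxPooling1D, shares one get_embedding helper across creative_id / ad_id / product_id, and switches the gender head to a 2-way softmax with categorical cross-entropy. The snippet below is a minimal, self-contained sketch of that tower pattern under toy assumptions: the vocabulary size of 1000 and the random embedding matrices stand in for the word2vec matrices produced by get_embedding, the build_tower helper is hypothetical, and mask_zero=True from the patch is omitted for simplicity.

# Minimal sketch of the new per-feature tower: Embedding -> stacked BiLSTM -> max pooling.
import numpy as np
from tensorflow.keras import layers, Model
from tensorflow.keras.layers import (Input, LSTM, Bidirectional, Embedding,
                                     Dense, concatenate)


def build_tower(name, vocab_size, emb_matrix, num_lstm=1, lstm_units=256):
    # One input branch: frozen pretrained embedding, num_lstm BiLSTM layers, max pooling over time.
    inp = Input(shape=(None,), name=name)
    x = Embedding(input_dim=vocab_size,
                  output_dim=emb_matrix.shape[1],
                  weights=[emb_matrix],
                  trainable=False)(inp)      # mirrors the --not_train_embedding flag
    for _ in range(num_lstm):                # mirrors the --num_lstm flag
        x = Bidirectional(LSTM(lstm_units, return_sequences=True))(x)
    x = layers.GlobalMaxPooling1D()(x)
    return inp, x


# Toy vocabulary sizes and random matrices stand in for the word2vec matrices.
towers = [build_tower(n, 1000, np.random.randn(1000, 128), num_lstm=3)
          for n in ('creative_id', 'ad_id', 'product_id')]
inputs, features = zip(*towers)
output_y = Dense(2, activation='softmax')(concatenate(list(features)))
model = Model(list(inputs), output_y)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()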