From e55be5c587e50ccd7e4589b99a826701afc8c44e Mon Sep 17 00:00:00 2001 From: sunlanchang Date: Mon, 22 Jun 2020 04:50:22 +0800 Subject: [PATCH] update predict transformer --- Transformer_keras_6_input.py | 146 ++++--- Transformer_keras_6_input_predict.py | 590 +++++++++++++++++++++++++++ 2 files changed, 690 insertions(+), 46 deletions(-) create mode 100644 Transformer_keras_6_input_predict.py diff --git a/Transformer_keras_6_input.py b/Transformer_keras_6_input.py index 08bf946..4b82aa6 100644 --- a/Transformer_keras_6_input.py +++ b/Transformer_keras_6_input.py @@ -46,6 +46,7 @@ default=False) parser.add_argument('--not_train_embedding', action='store_false', help='从npy文件加载数据', + default=True) parser.add_argument('--gender', action='store_true', help='gender model', @@ -53,12 +54,16 @@ parser.add_argument('--age', action='store_true', help='age model', default=False) + parser.add_argument('--batch_size', type=int, help='batch size大小', default=256) parser.add_argument('--epoch', type=int, help='epoch 大小', default=5) +parser.add_argument('--predict', action='store_true', + help='从npy文件加载数据', + default=False) parser.add_argument('--num_transformer', type=int, help='transformer层数', @@ -229,7 +234,7 @@ def get_age_model(DATA): input_product_id = Input(shape=(max_seq_len,), name='product_id') x3 = Embedding(input_dim=NUM_product_id+1, - output_dim=256, + output_dim=32, weights=[DATA['product_id_emb']], trainable=args.not_train_embedding, # trainable=False, @@ -238,7 +243,7 @@ def get_age_model(DATA): input_advertiser_id = Input(shape=(max_seq_len,), name='advertiser_id') x4 = Embedding(input_dim=NUM_advertiser_id+1, - output_dim=256, + output_dim=64, weights=[DATA['advertiser_id_emb']], trainable=args.not_train_embedding, # trainable=False, @@ -247,7 +252,7 @@ def get_age_model(DATA): input_industry = Input(shape=(max_seq_len,), name='industry') x5 = Embedding(input_dim=NUM_industry+1, - output_dim=256, + output_dim=16, weights=[DATA['industry_emb']], trainable=args.not_train_embedding, # trainable=False, @@ -257,7 +262,7 @@ def get_age_model(DATA): input_product_category = Input( shape=(max_seq_len,), name='product_category') x6 = Embedding(input_dim=NUM_product_category+1, - output_dim=256, + output_dim=8, weights=[DATA['product_category_emb']], trainable=args.not_train_embedding, # trainable=False, @@ -313,23 +318,6 @@ def get_age_model(DATA): def get_train_val(): - # 提取词向量文件 - def get_embedding(feature_name, tokenizer): - path = f'word2vec_new/{feature_name}.kv' - wv = KeyedVectors.load(path, mmap='r') - feature_tokens = list(wv.vocab.keys()) - feature_name_dict = {'creative_id': 256, 'ad_id': 256, 'advertiser_id': 64, - 'product_id': 32, 'product_category': 8, 'industry': 16} - embedding_dim = feature_name_dict[feature_name] - embedding_matrix = np.random.randn( - len(feature_tokens)+1, embedding_dim) - for feature in feature_tokens: - embedding_vector = wv[feature] - if embedding_vector is not None: - index = tokenizer.texts_to_sequences([feature])[0][0] - embedding_matrix[index] = embedding_vector - return embedding_matrix - # 从序列文件提取array格式数据 def get_train(feature_name, vocab_size, len_feature): f = open(f'word2vec_new/{feature_name}.txt') @@ -342,15 +330,38 @@ def get_train(feature_name, vocab_size, len_feature): for text in f: feature_seq.append(text.strip()) - sequences = tokenizer.texts_to_sequences(feature_seq[:900000//1]) + sequences = tokenizer.texts_to_sequences(feature_seq[:900000]) X_train = pad_sequences( sequences, maxlen=len_feature, padding='post') - return X_train, 
tokenizer - # 构造输出的训练标签 - # 获得age、gender标签 + sequences = tokenizer.texts_to_sequences(feature_seq[900000:]) + X_test = pad_sequences( + sequences, maxlen=len_feature, padding='post') + return X_train, tokenizer, X_test + + # 提取词向量文件 + def get_embedding(feature_name, tokenizer): + path = f'word2vec_new/{feature_name}.kv' + wv = KeyedVectors.load(path, mmap='r') + feature_tokens = list(wv.vocab.keys()) + feature_name_dict = {'creative_id': 256, 'ad_id': 256, 'advertiser_id': 64, + 'product_id': 32, 'product_category': 8, 'industry': 16} + embedding_dim = feature_name_dict[feature_name] + embedding_matrix = np.random.randn( + len(feature_tokens)+1, embedding_dim) + for word, i in tokenizer.word_index.items(): + embedding_vector = wv[word] + if embedding_vector is not None: + embedding_matrix[i] = embedding_vector + else: + print(str(word)+' 没有找到') + return embedding_matrix + DATA = {} + # 获取test数据 + # 构造输出的训练标签 + # 获得age、gender标签 user_train = pd.read_csv( 'data/train_preliminary/user.csv').sort_values(['user_id'], ascending=(True,)) Y_gender = user_train['gender'].values @@ -370,62 +381,69 @@ def get_train(feature_name, vocab_size, len_feature): # 第一个输入 print('获取 creative_id 特征') - X1_train, tokenizer = get_train( + X1_train, tokenizer, X1_test = get_train( 'creative_id', NUM_creative_id+1, LEN_creative_id) # +1为了UNK的creative_id creative_id_emb = get_embedding('creative_id', tokenizer) DATA['X1_train'] = X1_train[:train_examples] DATA['X1_val'] = X1_train[train_examples:] + DATA['X1_test'] = X1_test DATA['creative_id_emb'] = creative_id_emb # 第二个输入 print('获取 ad_id 特征') - X2_train, tokenizer = get_train( + X2_train, tokenizer, X2_test = get_train( 'ad_id', NUM_ad_id+1, LEN_ad_id) ad_id_emb = get_embedding('ad_id', tokenizer) DATA['X2_train'] = X2_train[:train_examples] DATA['X2_val'] = X2_train[train_examples:] + DATA['X2_test'] = X2_test + DATA['ad_id_emb'] = ad_id_emb # 第三个输入 print('获取 product_id 特征') - X3_train, tokenizer = get_train( + X3_train, tokenizer, X3_test = get_train( 'product_id', NUM_product_id+1, LEN_product_id) product_id_emb = get_embedding('product_id', tokenizer) DATA['X3_train'] = X3_train[:train_examples] DATA['X3_val'] = X3_train[train_examples:] + DATA['X3_test'] = X3_test DATA['product_id_emb'] = product_id_emb # 第四个输入 print('获取 advertiser_id 特征') - X4_train, tokenizer = get_train( + X4_train, tokenizer, X4_test = get_train( 'advertiser_id', NUM_advertiser_id+1, LEN_advertiser_id) advertiser_id_emb = get_embedding('advertiser_id', tokenizer) DATA['X4_train'] = X4_train[:train_examples] DATA['X4_val'] = X4_train[train_examples:] + DATA['X4_test'] = X4_test DATA['advertiser_id_emb'] = advertiser_id_emb # 第五个输入 print('获取 industry 特征') - X5_train, tokenizer = get_train( + X5_train, tokenizer, X5_test = get_train( 'industry', NUM_industry+1, LEN_industry) industry_emb = get_embedding('industry', tokenizer) DATA['X5_train'] = X5_train[:train_examples] DATA['X5_val'] = X5_train[train_examples:] + DATA['X5_test'] = X5_test DATA['industry_emb'] = industry_emb # 第六个输入 print('获取 product_category 特征') - X6_train, tokenizer = get_train( + X6_train, tokenizer, X6_test = get_train( 'product_category', NUM_product_category+1, LEN_product_category) product_category_emb = get_embedding('product_category', tokenizer) DATA['X6_train'] = X6_train[:train_examples] DATA['X6_val'] = X6_train[train_examples:] + DATA['X6_test'] = X6_test DATA['product_category_emb'] = product_category_emb return DATA @@ -448,6 +466,12 @@ def save_npy(datas, name): np.save(f'tmp/{name}_{i}.npy', data) 
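+            # Editor's note (hypothetical comment, not in the original patch):
+            # each array is cached as tmp/<name>_<i>.npy; the new 'test' group
+            # saved below is what the --predict path reloads via --load_from_npy.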
print(f'saving tmp/{name}_{i}.npy') + test = [DATA['X1_test'], + DATA['X2_test'], + DATA['X3_test'], + DATA['X4_test'], + DATA['X5_test'], + DATA['X6_test'], ] inputs = [ DATA['X1_train'], DATA['X1_val'], DATA['X2_train'], DATA['X2_val'], @@ -466,6 +490,7 @@ def save_npy(datas, name): DATA['industry_emb'], DATA['product_category_emb'], ] + save_npy(test, 'test') save_npy(inputs, 'inputs') save_npy(outputs_gender, 'gender') save_npy(outputs_age, 'age') @@ -501,6 +526,13 @@ def save_npy(datas, name): DATA['product_category_emb'] = np.load( 'tmp/embeddings_5.npy', allow_pickle=True) + DATA['X_test1'] = np.load('tmp/test_0.npy', allow_pickle=True) + DATA['X_test2'] = np.load('tmp/test_1.npy', allow_pickle=True) + DATA['X_test3'] = np.load('tmp/test_2.npy', allow_pickle=True) + DATA['X_test4'] = np.load('tmp/test_3.npy', allow_pickle=True) + DATA['X_test5'] = np.load('tmp/test_4.npy', allow_pickle=True) + DATA['X_test6'] = np.load('tmp/test_5.npy', allow_pickle=True) + # %% @@ -624,21 +656,43 @@ def save_npy(datas, name): # mail('train failed!!! ' + e) print(e) # %% -# model.load_weights('tmp/gender_epoch_01.hdf5') - - -# # %% -# if debug: -# sequences = tokenizer.texts_to_sequences( -# creative_id_seq[900000:]) -# else: -# sequences = tokenizer.texts_to_sequences( -# creative_id_seq[900000:]) - -# X_test = pad_sequences(sequences, maxlen=LEN_creative_id) -# # %% -# y_pred = model.predict(X_test, batch_size=4096) - +if args.predict: + model.load_weights('tmp/gender_epoch_01.hdf5') + y_pred = model.predict( + { + 'creative_id': DATA['X1_test'], + 'ad_id': DATA['X2_test'], + 'product_id': DATA['X3_test'], + 'advertiser_id': DATA['X4_test'], + 'industry': DATA['X5_test'], + 'product_category': DATA['X6_test'] + }, + batch_size=1024, + ) + y_pred = np.argmax(y_pred, axis=1) + y_pred = y_pred.flatten() + y_pred += 1 + + if args.gender: + ans = pd.DataFrame({'predicted_gender': y_pred}) + ans.to_csv( + 'data/ans/transformer_gender.csv', header=True, columns=['predicted_gender'], index=False) + elif args.age: + ans = pd.DataFrame({'predicted_age': y_pred}) + ans.to_csv( + 'data/ans/transformer_age.csv', header=True, columns=['predicted_age'], index=False) + + user_id_test = pd.read_csv( + 'data/test/clicklog_ad.csv').sort_values(['user_id'], ascending=(True,)).user_id.unique() + ans = pd.DataFrame({'user_id': user_id_test}) + + gender = pd.read_csv('data/ans/transformer_gender.csv') + age = pd.read_csv('data/ans/transformer_age.csv') + ans['predicted_gender'] = gender.predicted_gender + ans['predicted_age'] = age.predicted_age + ans.to_csv('data/ans/submission.csv', header=True, index=False, + columns=['user_id', 'predicted_age', 'predicted_gender']) +# %% # y_pred = np.where(y_pred > 0.5, 1, 0) # y_pred = y_pred.flatten() diff --git a/Transformer_keras_6_input_predict.py b/Transformer_keras_6_input_predict.py new file mode 100644 index 0000000..c92e8cb --- /dev/null +++ b/Transformer_keras_6_input_predict.py @@ -0,0 +1,590 @@ +# %% +# 生成词嵌入文件 +from layers import Add, LayerNormalization +from layers import MultiHeadAttention, PositionWiseFeedForward +from layers import PositionEncoding +from tensorflow.keras.callbacks import Callback +import tensorflow.keras.backend as K +from mymail import mail +from gensim.models import Word2Vec, KeyedVectors +from tensorflow.keras.utils import to_categorical +from tensorflow.keras.preprocessing.text import Tokenizer +from tensorflow.keras.preprocessing.sequence import pad_sequences +from tensorflow.keras.models import Model, Sequential +import os +from tqdm import 
tqdm +import numpy as np +import pandas as pd +import argparse +import tensorflow as tf +from tensorflow import keras +from tensorflow.keras import layers +from tensorflow.keras import losses +from tensorflow.keras import optimizers +from tensorflow.keras.callbacks import ModelCheckpoint, LearningRateScheduler +from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Dropout, Concatenate, Bidirectional, GlobalMaxPooling1D +from tensorflow.keras.models import Model, Sequential +from tensorflow.keras.preprocessing.sequence import pad_sequences +from tensorflow.keras.preprocessing.text import Tokenizer +from tensorflow.keras.utils import to_categorical +from gensim.models import Word2Vec, KeyedVectors +from mymail import mail + + +tf.config.experimental_run_functions_eagerly(True) + +# os.environ["CUDA_VISIBLE_DEVICES"] = "0" + +''' +python Transformer_keras.py --load_from_npy --batch_size 256 --epoch 5 --num_transformer 1 --head_attention 1 --num_lstm 1 --examples 100000 +''' + +# %% +parser = argparse.ArgumentParser() +parser.add_argument('--load_from_npy', action='store_true', + help='从npy文件加载数据', + default=False) +parser.add_argument('--not_train_embedding', action='store_false', + help='从npy文件加载数据', + + default=True) +parser.add_argument('--gender', action='store_true', + help='gender model', + default=False) +parser.add_argument('--age', action='store_true', + help='age model', + default=False) + +parser.add_argument('--batch_size', type=int, + help='batch size大小', + default=256) +parser.add_argument('--epoch', type=int, + help='epoch 大小', + default=5) +parser.add_argument('--predict', action='store_true', + help='从npy文件加载数据', + default=False) + +parser.add_argument('--num_transformer', type=int, + help='transformer层数', + default=1) +parser.add_argument('--head_attention', type=int, + help='transformer head个数', + default=1) + +parser.add_argument('--train_examples', type=int, + help='训练数据,默认为训练集,不包含验证集,调试时候可以设置1000', + default=810000) +parser.add_argument('--val_examples', type=int, + help='验证集数据,调试时候可以设置1000', + default=90000) +args = parser.parse_args() +# %% +NUM_creative_id = 3412772 +NUM_ad_id = 3027360 +NUM_product_id = 39057 +NUM_advertiser_id = 57870 +NUM_industry = 332 +NUM_product_category = 18 + +LEN_creative_id = 150 +LEN_ad_id = 150 +LEN_product_id = 150 +LEN_advertiser_id = 150 +LEN_industry = 150 +LEN_product_category = 150 + +# %% + + +def get_gender_model(DATA): + + feed_forward_size = 2048 + max_seq_len = 100 + model_dim = 128*6 + + input_creative_id = Input(shape=(max_seq_len,), name='creative_id') + x1 = Embedding(input_dim=NUM_creative_id+1, + output_dim=128, + weights=[DATA['creative_id_emb']], + trainable=args.not_train_embedding, + # trainable=False, + input_length=100, + mask_zero=True)(input_creative_id) + # encodings = PositionEncoding(model_dim)(x1) + # encodings = Add()([embeddings, encodings]) + + input_ad_id = Input(shape=(max_seq_len,), name='ad_id') + x2 = Embedding(input_dim=NUM_ad_id+1, + output_dim=128, + weights=[DATA['ad_id_emb']], + trainable=args.not_train_embedding, + # trainable=False, + input_length=100, + mask_zero=True)(input_ad_id) + + input_product_id = Input(shape=(max_seq_len,), name='product_id') + x3 = Embedding(input_dim=NUM_product_id+1, + output_dim=128, + weights=[DATA['product_id_emb']], + trainable=args.not_train_embedding, + # trainable=False, + input_length=100, + mask_zero=True)(input_product_id) + + input_advertiser_id = Input(shape=(max_seq_len,), name='advertiser_id') + x4 = Embedding(input_dim=NUM_advertiser_id+1, 
+ output_dim=128, + weights=[DATA['advertiser_id_emb']], + trainable=args.not_train_embedding, + # trainable=False, + input_length=100, + mask_zero=True)(input_advertiser_id) + + input_industry = Input(shape=(max_seq_len,), name='industry') + x5 = Embedding(input_dim=NUM_industry+1, + output_dim=128, + weights=[DATA['industry_emb']], + trainable=args.not_train_embedding, + # trainable=False, + input_length=100, + mask_zero=True)(input_industry) + + input_product_category = Input( + shape=(max_seq_len,), name='product_category') + x6 = Embedding(input_dim=NUM_product_category+1, + output_dim=128, + weights=[DATA['product_category_emb']], + trainable=args.not_train_embedding, + # trainable=False, + input_length=100, + mask_zero=True)(input_product_category) + + # (bs, 100, 128*2) + encodings = layers.Concatenate(axis=2)([x1, x2, x3, x4, x5, x6]) + # (bs, 100) + masks = tf.equal(input_creative_id, 0) + + # (bs, 100, 128*2) + attention_out = MultiHeadAttention(8, 96)( + [encodings, encodings, encodings, masks]) + + # Add & Norm + attention_out += encodings + attention_out = LayerNormalization()(attention_out) + # Feed-Forward + ff = PositionWiseFeedForward(model_dim, feed_forward_size) + ff_out = ff(attention_out) + # Add & Norm + # ff_out (bs, 100, 128),但是attention_out是(bs,100,256) + ff_out += attention_out + encodings = LayerNormalization()(ff_out) + encodings = GlobalMaxPooling1D()(encodings) + encodings = Dropout(0.2)(encodings) + + output_gender = Dense(2, activation='softmax', name='gender')(encodings) + # output_age = Dense(10, activation='softmax', name='age')(encodings) + + model = Model( + inputs=[input_creative_id, + input_ad_id, + input_product_id, + input_advertiser_id, + input_industry, + input_product_category], + outputs=[output_gender] + ) + + model.compile( + optimizer=optimizers.Adam(2.5e-4), + loss={ + 'gender': losses.CategoricalCrossentropy(from_logits=False), + # 'age': losses.CategoricalCrossentropy(from_logits=False) + }, + # loss_weights=[0.4, 0.6], + metrics=['accuracy']) + return model + + +def get_age_model(DATA): + + feed_forward_size = 2048 + max_seq_len = 150 + model_dim = 256+256+64+32+8+16 + + input_creative_id = Input(shape=(max_seq_len,), name='creative_id') + x1 = Embedding(input_dim=NUM_creative_id+1, + output_dim=256, + weights=[DATA['creative_id_emb']], + trainable=args.not_train_embedding, + # trainable=False, + input_length=max_seq_len, + mask_zero=True)(input_creative_id) + # encodings = PositionEncoding(model_dim)(x1) + # encodings = Add()([embeddings, encodings]) + + input_ad_id = Input(shape=(max_seq_len,), name='ad_id') + x2 = Embedding(input_dim=NUM_ad_id+1, + output_dim=256, + weights=[DATA['ad_id_emb']], + trainable=args.not_train_embedding, + # trainable=False, + input_length=max_seq_len, + mask_zero=True)(input_ad_id) + + input_product_id = Input(shape=(max_seq_len,), name='product_id') + x3 = Embedding(input_dim=NUM_product_id+1, + output_dim=32, + weights=[DATA['product_id_emb']], + trainable=args.not_train_embedding, + # trainable=False, + input_length=max_seq_len, + mask_zero=True)(input_product_id) + + input_advertiser_id = Input(shape=(max_seq_len,), name='advertiser_id') + x4 = Embedding(input_dim=NUM_advertiser_id+1, + output_dim=64, + weights=[DATA['advertiser_id_emb']], + trainable=args.not_train_embedding, + # trainable=False, + input_length=max_seq_len, + mask_zero=True)(input_advertiser_id) + + input_industry = Input(shape=(max_seq_len,), name='industry') + x5 = Embedding(input_dim=NUM_industry+1, + output_dim=16, + 
weights=[DATA['industry_emb']], + trainable=args.not_train_embedding, + # trainable=False, + input_length=max_seq_len, + mask_zero=True)(input_industry) + + input_product_category = Input( + shape=(max_seq_len,), name='product_category') + x6 = Embedding(input_dim=NUM_product_category+1, + output_dim=8, + weights=[DATA['product_category_emb']], + trainable=args.not_train_embedding, + # trainable=False, + input_length=max_seq_len, + mask_zero=True)(input_product_category) + + # (bs, 100, 128*2) + encodings = layers.Concatenate(axis=2)([x1, x2, x3, x4, x5, x6]) + # (bs, 100) + masks = tf.equal(input_creative_id, 0) + + # (bs, 100, 128*2) + # concat之后是632 + attention_out = MultiHeadAttention(8, 79)( + [encodings, encodings, encodings, masks]) + + # Add & Norm + attention_out += encodings + attention_out = LayerNormalization()(attention_out) + # Feed-Forward + ff = PositionWiseFeedForward(model_dim, feed_forward_size) + ff_out = ff(attention_out) + # Add & Norm + # ff_out (bs, 100, 128),但是attention_out是(bs,100,256) + ff_out += attention_out + encodings = LayerNormalization()(ff_out) + encodings = GlobalMaxPooling1D()(encodings) + encodings = Dropout(0.2)(encodings) + + # output_gender = Dense(2, activation='softmax', name='gender')(encodings) + output_age = Dense(10, activation='softmax', name='age')(encodings) + + model = Model( + inputs=[input_creative_id, + input_ad_id, + input_product_id, + input_advertiser_id, + input_industry, + input_product_category], + outputs=[output_age] + ) + + model.compile( + optimizer=optimizers.Adam(2.5e-4), + loss={ + # 'gender': losses.CategoricalCrossentropy(from_logits=False), + 'age': losses.CategoricalCrossentropy(from_logits=False) + }, + # loss_weights=[0.4, 0.6], + metrics=['accuracy']) + return model + + +def get_train_val(): + + # 从序列文件提取array格式数据 + def get_train(feature_name, vocab_size, len_feature): + ######################################## + f = open(f'word2vec_new/{feature_name}.txt') + tokenizer = Tokenizer(num_words=vocab_size) + tokenizer.fit_on_texts(f) + f.close() + + feature_seq = [] + ######################################### + with open(f'word2vec_new/{feature_name}.txt') as f: + for text in f: + feature_seq.append(text.strip()) + + sequences = tokenizer.texts_to_sequences(feature_seq[:900000]) + X_train = pad_sequences( + sequences, maxlen=len_feature, padding='post') + + sequences = tokenizer.texts_to_sequences(feature_seq[900000:]) + X_test = pad_sequences( + sequences, maxlen=len_feature, padding='post') + return X_train, tokenizer, X_test + + # 提取词向量文件 + def get_embedding(feature_name, tokenizer): + ######################################## + path = f'word2vec_new/{feature_name}.kv' + wv = KeyedVectors.load(path, mmap='r') + feature_tokens = list(wv.vocab.keys()) + feature_name_dict = {'creative_id': 256, 'ad_id': 256, 'advertiser_id': 64, + 'product_id': 32, 'product_category': 8, 'industry': 16} + embedding_dim = feature_name_dict[feature_name] + embedding_matrix = np.random.randn( + len(feature_tokens)+1, embedding_dim) + for word, i in tokenizer.word_index.items(): + embedding_vector = wv[word] + if embedding_vector is not None: + embedding_matrix[i] = embedding_vector + else: + print(str(word)+' 没有找到') + return embedding_matrix + + DATA = {} + # 获取test数据 + + # 构造输出的训练标签 + # 获得age、gender标签 + ####################################################### + user_train = pd.read_csv( + 'data/train_preliminary/user.csv').sort_values(['user_id'], ascending=(True,)) + Y_gender = user_train['gender'].values + Y_age = user_train['age'].values 
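+    # Hypothetical comment added in editing: user.csv stores gender in {1, 2}
+    # and age in {1, ..., 10}; the -1 shift below gives 0-based classes for
+    # to_categorical, matching the y_pred += 1 the predict path applies before
+    # writing predictions back out.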
+ Y_gender = Y_gender - 1 + Y_age = Y_age - 1 + Y_age = to_categorical(Y_age) + Y_gender = to_categorical(Y_gender) + + num_examples = Y_age.shape[0] + train_examples = int(num_examples * 0.9) + + DATA['Y_gender_train'] = Y_gender[:train_examples] + DATA['Y_gender_val'] = Y_gender[train_examples:] + DATA['Y_age_train'] = Y_age[:train_examples] + DATA['Y_age_val'] = Y_age[train_examples:] + + # 第一个输入 + print('获取 creative_id 特征') + X1_train, tokenizer, X1_test = get_train( + 'creative_id', NUM_creative_id+1, LEN_creative_id) # +1为了UNK的creative_id + creative_id_emb = get_embedding('creative_id', tokenizer) + + DATA['X1_train'] = X1_train[:train_examples] + DATA['X1_val'] = X1_train[train_examples:] + DATA['X1_test'] = X1_test + DATA['creative_id_emb'] = creative_id_emb + + # 第二个输入 + print('获取 ad_id 特征') + X2_train, tokenizer, X2_test = get_train( + 'ad_id', NUM_ad_id+1, LEN_ad_id) + ad_id_emb = get_embedding('ad_id', tokenizer) + + DATA['X2_train'] = X2_train[:train_examples] + DATA['X2_val'] = X2_train[train_examples:] + DATA['X2_test'] = X2_test + + DATA['ad_id_emb'] = ad_id_emb + + # 第三个输入 + print('获取 product_id 特征') + X3_train, tokenizer, X3_test = get_train( + 'product_id', NUM_product_id+1, LEN_product_id) + product_id_emb = get_embedding('product_id', tokenizer) + + DATA['X3_train'] = X3_train[:train_examples] + DATA['X3_val'] = X3_train[train_examples:] + DATA['X3_test'] = X3_test + DATA['product_id_emb'] = product_id_emb + + # 第四个输入 + print('获取 advertiser_id 特征') + X4_train, tokenizer, X4_test = get_train( + 'advertiser_id', NUM_advertiser_id+1, LEN_advertiser_id) + advertiser_id_emb = get_embedding('advertiser_id', tokenizer) + + DATA['X4_train'] = X4_train[:train_examples] + DATA['X4_val'] = X4_train[train_examples:] + DATA['X4_test'] = X4_test + DATA['advertiser_id_emb'] = advertiser_id_emb + + # 第五个输入 + print('获取 industry 特征') + X5_train, tokenizer, X5_test = get_train( + 'industry', NUM_industry+1, LEN_industry) + industry_emb = get_embedding('industry', tokenizer) + + DATA['X5_train'] = X5_train[:train_examples] + DATA['X5_val'] = X5_train[train_examples:] + DATA['X5_test'] = X5_test + DATA['industry_emb'] = industry_emb + + # 第六个输入 + print('获取 product_category 特征') + X6_train, tokenizer, X6_test = get_train( + 'product_category', NUM_product_category+1, LEN_product_category) + product_category_emb = get_embedding('product_category', tokenizer) + + DATA['X6_train'] = X6_train[:train_examples] + DATA['X6_val'] = X6_train[train_examples:] + DATA['X6_test'] = X6_test + DATA['product_category_emb'] = product_category_emb + + return DATA + + +# %% +if not args.load_from_npy: + mail('start getting train data') + print('从csv文件提取训练数据到array格式,大概十几分钟时间') + DATA = get_train_val() + mail('get train data done.') + + # 训练数据保存为npy文件 + dirs = 'tmp/' + if not os.path.exists(dirs): + os.makedirs(dirs) + + def save_npy(datas, name): + for i, data in enumerate(datas): + np.save(f'tmp/{name}_{i}.npy', data) + print(f'saving tmp/{name}_{i}.npy') + + test = [DATA['X1_test'], + DATA['X2_test'], + DATA['X3_test'], + DATA['X4_test'], + DATA['X5_test'], + DATA['X6_test'], ] + inputs = [ + DATA['X1_train'], DATA['X1_val'], + DATA['X2_train'], DATA['X2_val'], + DATA['X3_train'], DATA['X3_val'], + DATA['X4_train'], DATA['X4_val'], + DATA['X5_train'], DATA['X5_val'], + DATA['X6_train'], DATA['X6_val'], + ] + outputs_gender = [DATA['Y_gender_train'], DATA['Y_gender_val']] + outputs_age = [DATA['Y_age_train'], DATA['Y_age_val']] + embeddings = [ + DATA['creative_id_emb'], + DATA['ad_id_emb'], + 
DATA['product_id_emb'], + DATA['advertiser_id_emb'], + DATA['industry_emb'], + DATA['product_category_emb'], + ] + save_npy(test, 'test') + save_npy(inputs, 'inputs') + save_npy(outputs_gender, 'gender') + save_npy(outputs_age, 'age') + save_npy(embeddings, 'embeddings') +else: + DATA = {} + DATA['X1_train'] = np.load('tmp/inputs_0.npy', allow_pickle=True) + DATA['X1_val'] = np.load('tmp/inputs_1.npy', allow_pickle=True) + DATA['X2_train'] = np.load('tmp/inputs_2.npy', allow_pickle=True) + DATA['X2_val'] = np.load('tmp/inputs_3.npy', allow_pickle=True) + DATA['X3_train'] = np.load('tmp/inputs_4.npy', allow_pickle=True) + DATA['X3_val'] = np.load('tmp/inputs_5.npy', allow_pickle=True) + DATA['X4_train'] = np.load('tmp/inputs_6.npy', allow_pickle=True) + DATA['X4_val'] = np.load('tmp/inputs_7.npy', allow_pickle=True) + DATA['X5_train'] = np.load('tmp/inputs_8.npy', allow_pickle=True) + DATA['X5_val'] = np.load('tmp/inputs_9.npy', allow_pickle=True) + DATA['X6_train'] = np.load('tmp/inputs_10.npy', allow_pickle=True) + DATA['X6_val'] = np.load('tmp/inputs_11.npy', allow_pickle=True) + DATA['Y_gender_train'] = np.load('tmp/gender_0.npy', allow_pickle=True) + DATA['Y_gender_val'] = np.load('tmp/gender_1.npy', allow_pickle=True) + DATA['Y_age_train'] = np.load('tmp/age_0.npy', allow_pickle=True) + DATA['Y_age_val'] = np.load('tmp/age_1.npy', allow_pickle=True) + DATA['creative_id_emb'] = np.load( + 'tmp/embeddings_0.npy', allow_pickle=True) + DATA['ad_id_emb'] = np.load( + 'tmp/embeddings_1.npy', allow_pickle=True) + DATA['product_id_emb'] = np.load( + 'tmp/embeddings_2.npy', allow_pickle=True) + DATA['advertiser_id_emb'] = np.load( + 'tmp/embeddings_3.npy', allow_pickle=True) + DATA['industry_emb'] = np.load( + 'tmp/embeddings_4.npy', allow_pickle=True) + DATA['product_category_emb'] = np.load( + 'tmp/embeddings_5.npy', allow_pickle=True) + + DATA['X_test1'] = np.load('tmp/test_0.npy', allow_pickle=True) + DATA['X_test2'] = np.load('tmp/test_1.npy', allow_pickle=True) + DATA['X_test3'] = np.load('tmp/test_2.npy', allow_pickle=True) + DATA['X_test4'] = np.load('tmp/test_3.npy', allow_pickle=True) + DATA['X_test5'] = np.load('tmp/test_4.npy', allow_pickle=True) + DATA['X_test6'] = np.load('tmp/test_5.npy', allow_pickle=True) + + +# %% + +# # %% +# %% +if args.gender: + model = get_gender_model(DATA) +if args.age: + model = get_age_model(DATA) + +############################################## +model.load_weights('tmp/gender_epoch_01.hdf5') + +y_pred = model.predict( + { + 'creative_id': DATA['X1_test'], + 'ad_id': DATA['X2_test'], + 'product_id': DATA['X3_test'], + 'advertiser_id': DATA['X4_test'], + 'industry': DATA['X5_test'], + 'product_category': DATA['X6_test'] + }, + batch_size=1024, +) +y_pred = np.argmax(y_pred, axis=1) +y_pred = y_pred.flatten() +y_pred += 1 + +if args.gender: + ans = pd.DataFrame({'predicted_gender': y_pred}) + ################################################ + ans.to_csv( + 'data/ans/transformer_gender.csv', header=True, columns=['predicted_gender'], index=False) +elif args.age: + ans = pd.DataFrame({'predicted_age': y_pred}) + ################################################ + ans.to_csv( + 'data/ans/transformer_age.csv', header=True, columns=['predicted_age'], index=False) + + ############################################## + user_id_test = pd.read_csv( + 'data/test/clicklog_ad.csv').sort_values(['user_id'], ascending=(True,)).user_id.unique() + ans = pd.DataFrame({'user_id': user_id_test}) + + ############################################## + gender = 
pd.read_csv('data/ans/transformer_gender.csv') + age = pd.read_csv('data/ans/transformer_age.csv') + ans['predicted_gender'] = gender.predicted_gender + ans['predicted_age'] = age.predicted_age + ans.to_csv('data/ans/submission.csv', header=True, index=False, + columns=['user_id', 'predicted_age', 'predicted_gender'])
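
Two notes on the code this patch adds, with small sketches. All names introduced below (get_embedding_safe, misses, age_dims) are the editor's own, not part of the patch.

1. gensim's KeyedVectors raises KeyError for a token it has never seen, so the `wv[word]` lookup in the new get_embedding never returns None and its else branch is unreachable; a single out-of-vocabulary token aborts the load instead of printing the "not found" message. A minimal OOV-safe sketch under the same assumptions as the patch (gensim 3.x wv.vocab, a Tokenizer fitted on the matching word2vec_new/<feature_name>.txt file, and the same len(vocab)+1 matrix sizing so the Embedding weights still fit):

# %%
# OOV-safe variant of get_embedding (editor's sketch, not part of the patch).
import numpy as np
from gensim.models import KeyedVectors


def get_embedding_safe(feature_name, tokenizer):
    feature_name_dict = {'creative_id': 256, 'ad_id': 256, 'advertiser_id': 64,
                         'product_id': 32, 'product_category': 8, 'industry': 16}
    embedding_dim = feature_name_dict[feature_name]
    wv = KeyedVectors.load(f'word2vec_new/{feature_name}.kv', mmap='r')
    # Same sizing as the patch: len(vocab)+1 rows, so row 0 (padding) keeps a
    # random vector and the shape matches Embedding(input_dim=NUM_x+1, ...).
    embedding_matrix = np.random.randn(len(wv.vocab) + 1, embedding_dim)
    misses = 0
    for word, i in tokenizer.word_index.items():
        # wv[word] raises KeyError on a miss, so check membership first.
        if i < embedding_matrix.shape[0] and word in wv.vocab:
            embedding_matrix[i] = wv[word]
        else:
            misses += 1  # leave the random row in place as an UNK fallback
    print(f'{feature_name}: {misses} tokens not found in word2vec vocab')
    return embedding_matrix

2. The attention head arithmetic differs between the two models: get_gender_model concatenates six 128-dim embeddings (model_dim = 6 × 128 = 768) and uses MultiHeadAttention(8, 96) since 8 × 96 = 768, while get_age_model concatenates 256+256+32+64+16+8 = 632 channels and uses MultiHeadAttention(8, 79) since 8 × 79 = 632. A tiny self-contained check of those sums:

# %%
# Editor's sanity check (not part of the patch): head count × per-head width
# must equal the concatenated embedding width fed to MultiHeadAttention.
age_dims = {'creative_id': 256, 'ad_id': 256, 'product_id': 32,
            'advertiser_id': 64, 'industry': 16, 'product_category': 8}
assert sum(age_dims.values()) == 8 * 79 == 632   # get_age_model
assert 6 * 128 == 8 * 96 == 768                  # get_gender_model

Two review notes on the new --load_from_npy path: it stores the test arrays as DATA['X_test1'] … DATA['X_test6'], while the predict block indexes DATA['X1_test'] … DATA['X6_test'], so running with --load_from_npy --predict raises KeyError until the two spellings are unified; and the new predict script imports several modules twice (Model, Sequential, Tokenizer, pad_sequences, to_categorical, KeyedVectors, mail), which is harmless but worth deduplicating. Assuming the tmp/*.hdf5 checkpoint and data layout above, a plausible invocation of the updated training script that exercises the new predict path is:

python Transformer_keras_6_input.py --load_from_npy --gender --predict --batch_size 256 --epoch 5

(--gender or --age must be passed: they select which model is built and which prediction CSV is written.)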