From 01d8b2f18bdac88025ed8efd3e39becb0c2e71b4 Mon Sep 17 00:00:00 2001
From: sunlanchang
Date: Mon, 15 Jun 2020 11:57:45 +0800
Subject: [PATCH] update LSTM

---
 LSTM_age_multi_input.py        |  90 ++++-------
 LSTM_gender_multi_input.py     | 203 +++++++++++++-----------
 LSTM_gender_multi_input_old.py | 279 +++++++++++++++++++++++++++++++++
 test.py                        |  98 ++++++++++++
 4 files changed, 519 insertions(+), 151 deletions(-)
 create mode 100644 LSTM_gender_multi_input_old.py
 create mode 100644 test.py

diff --git a/LSTM_age_multi_input.py b/LSTM_age_multi_input.py
index 4359ec3..8721d2a 100644
--- a/LSTM_age_multi_input.py
+++ b/LSTM_age_multi_input.py
@@ -18,6 +18,7 @@
 os.environ["CUDA_VISIBLE_DEVICES"] = "0"
 
 # %%
+# Compute the length of the creative_id sequences; this only needs to be done once
 # f = open('word2vec/userid_creative_ids.txt')
 # LEN_creative_id = -1
 # for line in f:
@@ -27,10 +28,10 @@
 # %%
 parser = argparse.ArgumentParser()
 parser.add_argument('--load_from_npy', action='store_true',
-                    help='load data from npy files',
+                    help='load training data from npy files so the arrays do not have to be regenerated for every run',
                     default=False)
 parser.add_argument('--not_train_embedding', action='store_false',
-                    help='load data from npy files',
+                    help='do not train the embeddings; in general adding this flag does not help',
                     default=True)
 
 parser.add_argument('--epoch', type=int,
@@ -40,12 +41,12 @@
                     help='batch size',
                     default=256)
 parser.add_argument('--examples', type=int,
-                    help='number of training examples; defaults to the training set, excluding the validation set',
+                    help='number of training examples; defaults to the training set excluding the validation set; set to 1000 when debugging',
                     default=810000)
 
 
 parser.add_argument('--num_lstm', type=int,
-                    help='number of LSTM heads',
+                    help='number of stacked LSTM layers; so far 3 layers works better than 5, 1 layer is still being tested...',
                     default=1)
 
 args = parser.parse_args()
@@ -57,6 +58,22 @@
 
 
 def get_train_val():
+    # Load the pretrained word-vector file
+    def get_embedding(feature_name):
+        path = f"word2vec/wordvectors_{feature_name}.kv"
+        wv = KeyedVectors.load(path, mmap='r')
+        feature_tokens = list(wv.vocab.keys())
+        embedding_dim = 128
+        embedding_matrix = np.random.randn(
+            len(feature_tokens)+1, embedding_dim)
+        for feature in feature_tokens:
+            embedding_vector = wv[feature]
+            if embedding_vector is not None:
+                index = tokenizer.texts_to_sequences([feature])[0][0]
+                embedding_matrix[index] = embedding_vector
+        return embedding_matrix
+
+    # First input
     # Build the creative_id feature
     # f = open('tmp/userid_creative_ids.txt')
     f = open('word2vec/userid_creative_ids.txt')
@@ -72,23 +89,9 @@ def get_train_val():
     X1_train = pad_sequences(
         sequences, maxlen=LEN_creative_id, padding='post')
 
-    # Build the creative_id embedding
-    def get_creative_id_emb():
-        path = "word2vec/wordvectors_creative_id.kv"
-        wv = KeyedVectors.load(path, mmap='r')
-        creative_id_tokens = list(wv.vocab.keys())
-        embedding_dim = 128
-        embedding_matrix = np.random.randn(
-            len(creative_id_tokens)+1, embedding_dim)
-        for creative_id in creative_id_tokens:
-            embedding_vector = wv[creative_id]
-            if embedding_vector is not None:
-                index = tokenizer.texts_to_sequences([creative_id])[0][0]
-                embedding_matrix[index] = embedding_vector
-        return embedding_matrix
-
-    creative_id_emb = get_creative_id_emb()
+    creative_id_emb = get_embedding(feature_name='creative_id')
 
+    # Second input
     # Build the ad_id feature
     f = open('word2vec/userid_ad_ids.txt')
     tokenizer = Tokenizer(num_words=NUM_ad_id)
@@ -103,22 +106,9 @@ def get_creative_id_emb():
     X2_train = pad_sequences(
         sequences, maxlen=LEN_ad_id, padding='post')
 
-    def get_ad_id_emb():
-        path = "word2vec/wordvectors_ad_id.kv"
-        wv = KeyedVectors.load(path, mmap='r')
-        ad_id_tokens = list(wv.vocab.keys())
-        embedding_dim = 128
-        embedding_matrix = np.random.randn(
-            len(ad_id_tokens)+1, embedding_dim)
-        for ad_id in ad_id_tokens:
-            embedding_vector = wv[ad_id]
-            if embedding_vector is not None:
-                index = tokenizer.texts_to_sequences([ad_id])[0][0]
-                embedding_matrix[index] = embedding_vector
-        return embedding_matrix
-
-    ad_id_emb = get_ad_id_emb()
+    ad_id_emb = get_embedding(feature_name='ad_id')
 
+    # Third input
     # Build the product_id feature
     # f = open('tmp/userid_product_ids.txt')
     f = open('word2vec/userid_product_ids.txt')
@@ -134,24 +124,10 @@ def get_ad_id_emb():
     X3_train = pad_sequences(
         sequences, maxlen=LEN_product_id, padding='post')
 
-    # Build the product_id embedding
-    def get_product_id_emb():
-        path = "word2vec/wordvectors_product_id.kv"
-        wv = KeyedVectors.load(path, mmap='r')
-        product_id_tokens = list(wv.vocab.keys())
-        embedding_dim = 128
-        embedding_matrix = np.random.randn(
-            len(product_id_tokens)+1, embedding_dim)
-        for product_id in product_id_tokens:
-            embedding_vector = wv[product_id]
-            if embedding_vector is not None:
-                index = tokenizer.texts_to_sequences([product_id])[0][0]
-                embedding_matrix[index] = embedding_vector
-        return embedding_matrix
-
-    product_id_emb = get_product_id_emb()
+    product_id_emb = get_embedding(feature_name='product_id')
 
-    # Build the age labels
+    # Build the training labels for the outputs
+    # Build the age and gender labels
     user_train = pd.read_csv(
         'data/train_preliminary/user.csv').sort_values(['user_id'], ascending=(True,))
     Y_gender = user_train['gender'].values
@@ -268,15 +244,6 @@ def save_data(datas):
 # %%
 checkpoint = ModelCheckpoint("tmp/age_epoch_{epoch:02d}.hdf5", monitor='val_loss', verbose=1,
                              save_best_only=False, mode='auto', period=1)
-# %%
-# model.fit(
-#     {'creative_id': x1_train, 'ad_id': x2_train},
-#     y_train,
-#     validation_data=([x1_val, x2_val], y_val),
-#     epochs=5,
-#     batch_size=256,
-#     callbacks=[checkpoint],
-# )
 
 # %%
 try:
@@ -298,6 +265,7 @@ def save_data(datas):
 
 
 # %%
+# What follows is the prediction step; commented out for now, unused, but do not delete
 # model.load_weights('tmp\gender_epoch_01.hdf5')
 
 
diff --git a/LSTM_gender_multi_input.py b/LSTM_gender_multi_input.py
index f465851..945f20a 100644
--- a/LSTM_gender_multi_input.py
+++ b/LSTM_gender_multi_input.py
@@ -1,28 +1,55 @@
 # %%
 # Generate the word-embedding files
-from tqdm import tqdm
+import os
+import tensorflow as tf
 import numpy as np
 import pandas as pd
+from tqdm import tqdm
 from tensorflow.keras.callbacks import ModelCheckpoint, LearningRateScheduler
-from gensim.models import Word2Vec, KeyedVectors
-from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Dropout, concatenate
+from tensorflow.keras import layers
+from tensorflow.keras.layers import Input, LSTM, Bidirectional, Embedding, Dense, Dropout, concatenate
 from tensorflow.keras.models import Model, Sequential
-import tensorflow as tf
 from tensorflow.keras.preprocessing.sequence import pad_sequences
 from tensorflow.keras.preprocessing.text import Tokenizer
+from tensorflow.keras.utils import to_categorical
+from gensim.models import Word2Vec, KeyedVectors
 from mymail import mail
-import os
+import argparse
 os.environ["CUDA_VISIBLE_DEVICES"] = "0"
 
 # %%
+# Compute the length of the creative_id sequences; this only needs to be done once
 # f = open('word2vec/userid_creative_ids.txt')
 # LEN_creative_id = -1
 # for line in f:
 #     current_line_len = len(line.strip().split(' '))
 #     LEN_creative_id = max(LEN_creative_id, current_line_len)
 # f.close()
-
-
+# %%
+parser = argparse.ArgumentParser()
+parser.add_argument('--load_from_npy', action='store_true',
+                    help='load training data from npy files so the arrays do not have to be regenerated for every run',
+                    default=False)
+parser.add_argument('--not_train_embedding', action='store_false',
+                    help='do not train the embeddings; in general adding this flag does not help',
+                    default=True)
+
+parser.add_argument('--epoch', type=int,
+                    help='number of epochs',
+                    default=5)
+parser.add_argument('--batch_size', type=int,
+                    help='batch size',
+                    default=256)
+parser.add_argument('--examples', type=int,
+                    help='number of training examples; defaults to the training set excluding the validation set; set to 1000 when debugging',
+                    default=810000)
+
+
+parser.add_argument('--num_lstm', type=int,
+                    help='number of stacked LSTM layers; so far 3 layers works better than 5, 1 layer is still being tested...',
+                    default=1)
+
+args = parser.parse_args()
 # %%
 NUM_creative_id = 2481135+1
 NUM_ad_id = 2264190+1
@@ -31,6 +58,22 @@
 
 
 def get_train_val():
+    # Load the pretrained word-vector file
+    def get_embedding(feature_name):
+        path = f"word2vec/wordvectors_{feature_name}.kv"
+        wv = KeyedVectors.load(path, mmap='r')
+        feature_tokens = list(wv.vocab.keys())
+        embedding_dim = 128
+        embedding_matrix = np.random.randn(
+            len(feature_tokens)+1, embedding_dim)
+        for feature in feature_tokens:
+            embedding_vector = wv[feature]
+            if embedding_vector is not None:
+                index = tokenizer.texts_to_sequences([feature])[0][0]
+                embedding_matrix[index] = embedding_vector
+        return embedding_matrix
+
+    # First input
     # Build the creative_id feature
     # f = open('tmp/userid_creative_ids.txt')
     f = open('word2vec/userid_creative_ids.txt')
@@ -38,7 +81,7 @@ def get_train_val():
     tokenizer.fit_on_texts(f)
     f.close()
     creative_id_seq = []
-    with open('word2vec/userid_creative_ids.txt', 'r') as f:
+    with open('word2vec/userid_creative_ids.txt') as f:
         for text in f:
             creative_id_seq.append(text.strip())
 
@@ -46,23 +89,9 @@ def get_train_val():
     X1_train = pad_sequences(
         sequences, maxlen=LEN_creative_id, padding='post')
 
-    # Build the creative_id embedding
-    def get_creative_id_emb():
-        path = "word2vec/wordvectors_creative_id.kv"
-        wv = KeyedVectors.load(path, mmap='r')
-        creative_id_tokens = list(wv.vocab.keys())
-        embedding_dim = 128
-        embedding_matrix = np.random.randn(
-            len(creative_id_tokens)+1, embedding_dim)
-        for creative_id in creative_id_tokens:
-            embedding_vector = wv[creative_id]
-            if embedding_vector is not None:
-                index = tokenizer.texts_to_sequences([creative_id])[0][0]
-                embedding_matrix[index] = embedding_vector
-        return embedding_matrix
-
-    creative_id_emb = get_creative_id_emb()
+    creative_id_emb = get_embedding(feature_name='creative_id')
 
+    # Second input
     # Build the ad_id feature
     f = open('word2vec/userid_ad_ids.txt')
     tokenizer = Tokenizer(num_words=NUM_ad_id)
@@ -77,22 +106,9 @@ def get_creative_id_emb():
     X2_train = pad_sequences(
         sequences, maxlen=LEN_ad_id, padding='post')
 
-    def get_ad_id_emb():
-        path = "word2vec/wordvectors_ad_id.kv"
-        wv = KeyedVectors.load(path, mmap='r')
-        ad_id_tokens = list(wv.vocab.keys())
-        embedding_dim = 128
-        embedding_matrix = np.random.randn(
-            len(ad_id_tokens)+1, embedding_dim)
-        for ad_id in ad_id_tokens:
-            embedding_vector = wv[ad_id]
-            if embedding_vector is not None:
-                index = tokenizer.texts_to_sequences([ad_id])[0][0]
-                embedding_matrix[index] = embedding_vector
-        return embedding_matrix
-
-    ad_id_emb = get_ad_id_emb()
+    ad_id_emb = get_embedding(feature_name='ad_id')
 
+    # Third input
     # Build the product_id feature
     # f = open('tmp/userid_product_ids.txt')
     f = open('word2vec/userid_product_ids.txt')
@@ -108,30 +124,19 @@ def get_ad_id_emb():
     X3_train = pad_sequences(
         sequences, maxlen=LEN_product_id, padding='post')
 
-    # Build the product_id embedding
-    def get_product_id_emb():
-        path = "word2vec/wordvectors_product_id.kv"
-        wv = KeyedVectors.load(path, mmap='r')
-        product_id_tokens = list(wv.vocab.keys())
-        embedding_dim = 128
-        embedding_matrix = np.random.randn(
-            len(product_id_tokens)+1, embedding_dim)
-        for product_id in product_id_tokens:
-            embedding_vector = wv[product_id]
-            if embedding_vector is not None:
-                index = tokenizer.texts_to_sequences([product_id])[0][0]
-                embedding_matrix[index] = embedding_vector
-        return embedding_matrix
-
-    product_id_emb = get_product_id_emb()
+    product_id_emb = get_embedding(feature_name='product_id')
 
-    # Build the gender labels
+    # Build the training labels for the outputs
+    # Build the age and gender labels
     user_train = pd.read_csv(
         'data/train_preliminary/user.csv').sort_values(['user_id'], ascending=(True,))
     Y_gender = user_train['gender'].values
     Y_age = user_train['age'].values
     Y_gender = Y_gender - 1
-    num_examples = Y_gender.shape[0]
+    Y_age = Y_age - 1
+    Y_age = to_categorical(Y_age)
+    Y_gender = to_categorical(Y_gender)
+    num_examples = Y_age.shape[0]
     train_examples = int(num_examples * 0.9)
 
     # Corresponds to x1_train, x1_val, x2_train, x2_val, y_train, y_val
@@ -157,45 +162,45 @@ def get_gender_model(creative_id_emb, ad_id_emb, product_id_emb):
     x1 = Embedding(input_dim=NUM_creative_id,
                    output_dim=128,
                    weights=[creative_id_emb],
-                   trainable=True,
+                   trainable=args.not_train_embedding,
                    input_length=LEN_creative_id,
                    mask_zero=True)(input_creative_id)
-    x1 = LSTM(1024, return_sequences=True)(x1)
-    x1 = LSTM(512, return_sequences=True)(x1)
-    x1 = LSTM(256, return_sequences=False)(x1)
+    for _ in range(args.num_lstm):
+        x1 = Bidirectional(LSTM(256, return_sequences=True))(x1)
+    x1 = layers.GlobalMaxPooling1D()(x1)
 
     # second input
     input_ad_id = Input(shape=(None,), name='ad_id')
     x2 = Embedding(input_dim=NUM_ad_id,
                    output_dim=128,
                    weights=[ad_id_emb],
-                   trainable=True,
+                   trainable=args.not_train_embedding,
                    input_length=LEN_ad_id,
                    mask_zero=True)(input_ad_id)
-    x2 = LSTM(1024, return_sequences=True)(x2)
-    x2 = LSTM(512, return_sequences=True)(x2)
-    x2 = LSTM(256, return_sequences=False)(x2)
+    for _ in range(args.num_lstm):
+        x2 = Bidirectional(LSTM(256, return_sequences=True))(x2)
+    x2 = layers.GlobalMaxPooling1D()(x2)
 
     # third input
     input_product_id = Input(shape=(None,), name='product_id')
    x3 = Embedding(input_dim=NUM_product_id,
                    output_dim=128,
                    weights=[product_id_emb],
-                   trainable=True,
+                   trainable=args.not_train_embedding,
                    input_length=LEN_product_id,
                    mask_zero=True)(input_product_id)
-    x3 = LSTM(1024, return_sequences=True)(x3)
-    x3 = LSTM(512, return_sequences=True)(x3)
-    x3 = LSTM(256, return_sequences=False)(x3)
+    for _ in range(args.num_lstm):
+        x3 = Bidirectional(LSTM(256, return_sequences=True))(x3)
+    x3 = layers.GlobalMaxPooling1D()(x3)
 
     # concat x1 x2
     x = concatenate([x1, x2, x3])
-    x = Dense(128)(x)
-    x = Dropout(0.1)(x)
-    output_y = Dense(1, activation='sigmoid')(x)
+    # x = Dense(128)(x)
+    # x = Dropout(0.1)(x)
+    output_y = Dense(2, activation='softmax')(x)
 
     model = Model([input_creative_id, input_ad_id, input_product_id], output_y)
-    model.compile(loss='binary_crossentropy',
+    model.compile(loss='categorical_crossentropy',
                   optimizer='adam', metrics=['accuracy'])
     model.summary()
 
@@ -203,10 +208,34 @@ def get_gender_model(creative_id_emb, ad_id_emb, product_id_emb):
 
 
 # %%
-mail('start getting train data')
-x1_train, x1_val, x2_train, x2_val, x3_train, x3_val, y_train, y_val, creative_id_emb, ad_id_emb, product_id_emb = get_train_val()
-mail('get train data done.')
+if not args.load_from_npy:
+    mail('start getting train data')
+    x1_train, x1_val, x2_train, x2_val, x3_train, x3_val, y_train, y_val, creative_id_emb, ad_id_emb, product_id_emb = get_train_val()
+    mail('get train data done.')
+
+    def save_data(datas):
+        dirs = 'tmp/'
+        if not os.path.exists(dirs):
+            os.makedirs(dirs)
+        for i, data in enumerate(datas):
+            np.save(f'tmp/transformer_input_{i}.npy', data)
+    datas = [x1_train, x1_val, x2_train, x2_val, x3_train, x3_val,
+             y_train, y_val, creative_id_emb, ad_id_emb, product_id_emb]
+    save_data(datas)
+else:
+    x1_train = np.load('tmp/transformer_input_0.npy', allow_pickle=True)
+    x1_val = np.load('tmp/transformer_input_1.npy', allow_pickle=True)
+    x2_train = np.load('tmp/transformer_input_2.npy', allow_pickle=True)
+    x2_val = np.load('tmp/transformer_input_3.npy', allow_pickle=True)
+    x3_train = np.load('tmp/transformer_input_4.npy', allow_pickle=True)
+    x3_val = np.load('tmp/transformer_input_5.npy', allow_pickle=True)
+    y_train = np.load('tmp/transformer_input_6.npy', allow_pickle=True)
+    y_val = np.load('tmp/transformer_input_7.npy', allow_pickle=True)
+    creative_id_emb = np.load('tmp/transformer_input_8.npy', allow_pickle=True)
+    ad_id_emb = np.load('tmp/transformer_input_9.npy', allow_pickle=True)
+    product_id_emb = np.load('tmp/transformer_input_10.npy', allow_pickle=True)
 
+# %%
 model = get_gender_model(creative_id_emb, ad_id_emb, product_id_emb)
 # %%
 # %%
@@ -219,34 +248,28 @@ def get_gender_model(creative_id_emb, ad_id_emb, product_id_emb):
 # %%
 checkpoint = ModelCheckpoint("tmp/gender_epoch_{epoch:02d}.hdf5", monitor='val_loss', verbose=1,
                              save_best_only=False, mode='auto', period=1)
-# %%
-# model.fit(
-#     {'creative_id': x1_train, 'ad_id': x2_train},
-#     y_train,
-#     validation_data=([x1_val, x2_val], y_val),
-#     epochs=5,
-#     batch_size=256,
-#     callbacks=[checkpoint],
-# )
 
 # %%
 try:
+    examples = args.examples
     mail('start train lstm')
     model.fit(
-        {'creative_id': x1_train, 'ad_id': x2_train, 'product_id': x3_train},
-        y_train,
+        {'creative_id': x1_train[:examples], 'ad_id': x2_train[:examples],
+            'product_id': x3_train[:examples]},
+        y_train[:examples],
        validation_data=([x1_val, x2_val, x3_val], y_val),
-        epochs=3,
-        batch_size=256,
+        epochs=args.epoch,
+        batch_size=args.batch_size,
         callbacks=[checkpoint],
     )
-    mail('train gender lstm done!!!')
+    mail('train lstm done!!!')
 except Exception as e:
     e = str(e)
     mail('train lstm failed!!! ' + e)
 
 
 # %%
+# What follows is the prediction step; commented out for now, unused, but do not delete
 # model.load_weights('tmp\gender_epoch_01.hdf5')
 
 
diff --git a/LSTM_gender_multi_input_old.py b/LSTM_gender_multi_input_old.py
new file mode 100644
index 0000000..f465851
--- /dev/null
+++ b/LSTM_gender_multi_input_old.py
@@ -0,0 +1,279 @@
+# %%
+# Generate the word-embedding files
+from tqdm import tqdm
+import numpy as np
+import pandas as pd
+from tensorflow.keras.callbacks import ModelCheckpoint, LearningRateScheduler
+from gensim.models import Word2Vec, KeyedVectors
+from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Dropout, concatenate
+from tensorflow.keras.models import Model, Sequential
+import tensorflow as tf
+from tensorflow.keras.preprocessing.sequence import pad_sequences
+from tensorflow.keras.preprocessing.text import Tokenizer
+from mymail import mail
+import os
+os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+
+# %%
+# f = open('word2vec/userid_creative_ids.txt')
+# LEN_creative_id = -1
+# for line in f:
+#     current_line_len = len(line.strip().split(' '))
+#     LEN_creative_id = max(LEN_creative_id, current_line_len)
+# f.close()
+
+
+# %%
+NUM_creative_id = 2481135+1
+NUM_ad_id = 2264190+1
+NUM_product_id = 33273+1
+
+
+def get_train_val():
+
+    # Build the creative_id feature
+    # f = open('tmp/userid_creative_ids.txt')
+    f = open('word2vec/userid_creative_ids.txt')
+    tokenizer = Tokenizer(num_words=NUM_creative_id)
+    tokenizer.fit_on_texts(f)
+    f.close()
+    creative_id_seq = []
+    with open('word2vec/userid_creative_ids.txt', 'r') as f:
+        for text in f:
+            creative_id_seq.append(text.strip())
+
+    sequences = tokenizer.texts_to_sequences(creative_id_seq[:900000//1])
+    X1_train = pad_sequences(
+        sequences, maxlen=LEN_creative_id, padding='post')
+
+    # Build the creative_id embedding
+    def get_creative_id_emb():
+        path = "word2vec/wordvectors_creative_id.kv"
+        wv = KeyedVectors.load(path, mmap='r')
+        creative_id_tokens = list(wv.vocab.keys())
+        embedding_dim = 128
+        embedding_matrix = np.random.randn(
+            len(creative_id_tokens)+1, embedding_dim)
+        for creative_id in creative_id_tokens:
+            embedding_vector = wv[creative_id]
+            if embedding_vector is not None:
+                index = tokenizer.texts_to_sequences([creative_id])[0][0]
+                embedding_matrix[index] = embedding_vector
+        return embedding_matrix
+
+    creative_id_emb = get_creative_id_emb()
+
+    # Build the ad_id feature
+    f = open('word2vec/userid_ad_ids.txt')
+    tokenizer = Tokenizer(num_words=NUM_ad_id)
+    tokenizer.fit_on_texts(f)
+    f.close()
+    ad_id_seq = []
+    with open('word2vec/userid_ad_ids.txt') as f:
+        for text in f:
+            ad_id_seq.append(text.strip())
+
+    sequences = tokenizer.texts_to_sequences(ad_id_seq[:900000//1])
+    X2_train = pad_sequences(
+        sequences, maxlen=LEN_ad_id, padding='post')
+
+    def get_ad_id_emb():
+        path = "word2vec/wordvectors_ad_id.kv"
+        wv = KeyedVectors.load(path, mmap='r')
+        ad_id_tokens = list(wv.vocab.keys())
+        embedding_dim = 128
+        embedding_matrix = np.random.randn(
+            len(ad_id_tokens)+1, embedding_dim)
+        for ad_id in ad_id_tokens:
+            embedding_vector = wv[ad_id]
+            if embedding_vector is not None:
+                index = tokenizer.texts_to_sequences([ad_id])[0][0]
+                embedding_matrix[index] = embedding_vector
+        return embedding_matrix
+
+    ad_id_emb = get_ad_id_emb()
+
+    # Build the product_id feature
+    # f = open('tmp/userid_product_ids.txt')
+    f = open('word2vec/userid_product_ids.txt')
+    tokenizer = Tokenizer(num_words=NUM_product_id)
+    tokenizer.fit_on_texts(f)
+    f.close()
+    product_id_seq = []
+    with open('word2vec/userid_product_ids.txt') as f:
+        for text in f:
+            product_id_seq.append(text.strip())
+
+    sequences = tokenizer.texts_to_sequences(product_id_seq[:900000//1])
+    X3_train = pad_sequences(
+        sequences, maxlen=LEN_product_id, padding='post')
+
+    # Build the product_id embedding
+    def get_product_id_emb():
+        path = "word2vec/wordvectors_product_id.kv"
+        wv = KeyedVectors.load(path, mmap='r')
+        product_id_tokens = list(wv.vocab.keys())
+        embedding_dim = 128
+        embedding_matrix = np.random.randn(
+            len(product_id_tokens)+1, embedding_dim)
+        for product_id in product_id_tokens:
+            embedding_vector = wv[product_id]
+            if embedding_vector is not None:
+                index = tokenizer.texts_to_sequences([product_id])[0][0]
+                embedding_matrix[index] = embedding_vector
+        return embedding_matrix
+
+    product_id_emb = get_product_id_emb()
+
+    # Build the gender labels
+    user_train = pd.read_csv(
+        'data/train_preliminary/user.csv').sort_values(['user_id'], ascending=(True,))
+    Y_gender = user_train['gender'].values
+    Y_age = user_train['age'].values
+    Y_gender = Y_gender - 1
+    num_examples = Y_gender.shape[0]
+    train_examples = int(num_examples * 0.9)
+
+    # Corresponds to x1_train, x1_val, x2_train, x2_val, y_train, y_val
+    return X1_train[:train_examples], X1_train[train_examples:], X2_train[:train_examples], X2_train[train_examples:], X3_train[:train_examples], X3_train[train_examples:], Y_gender[:train_examples], Y_gender[train_examples:], creative_id_emb, ad_id_emb, product_id_emb
+
+# %%
+
+
+def get_test():
+    pass
+
+
+# %%
+LEN_creative_id = 100
+LEN_ad_id = 100
+LEN_product_id = 100
+
+
+def get_gender_model(creative_id_emb, ad_id_emb, product_id_emb):
+    # shape: (sequence length, )
+    # first input
+    input_creative_id = Input(shape=(None,), name='creative_id')
+    x1 = Embedding(input_dim=NUM_creative_id,
+                   output_dim=128,
+                   weights=[creative_id_emb],
+                   trainable=True,
+                   input_length=LEN_creative_id,
+                   mask_zero=True)(input_creative_id)
+    x1 = LSTM(1024, return_sequences=True)(x1)
+    x1 = LSTM(512, return_sequences=True)(x1)
+    x1 = LSTM(256, return_sequences=False)(x1)
+
+    # second input
+    input_ad_id = Input(shape=(None,), name='ad_id')
+    x2 = Embedding(input_dim=NUM_ad_id,
+                   output_dim=128,
+                   weights=[ad_id_emb],
+                   trainable=True,
+                   input_length=LEN_ad_id,
+                   mask_zero=True)(input_ad_id)
+    x2 = LSTM(1024, return_sequences=True)(x2)
+    x2 = LSTM(512, return_sequences=True)(x2)
+    x2 = LSTM(256, return_sequences=False)(x2)
+
+    # third input
+    input_product_id = Input(shape=(None,), name='product_id')
+    x3 = Embedding(input_dim=NUM_product_id,
+                   output_dim=128,
+                   weights=[product_id_emb],
+                   trainable=True,
+                   input_length=LEN_product_id,
+                   mask_zero=True)(input_product_id)
+    x3 = LSTM(1024, return_sequences=True)(x3)
+    x3 = LSTM(512, return_sequences=True)(x3)
+    x3 = LSTM(256, return_sequences=False)(x3)
+
+    # concat x1 x2
+    x = concatenate([x1, x2, x3])
+    x = Dense(128)(x)
+    x = Dropout(0.1)(x)
+    output_y = Dense(1, activation='sigmoid')(x)
+
+    model = Model([input_creative_id, input_ad_id, input_product_id], output_y)
+    model.compile(loss='binary_crossentropy',
+                  optimizer='adam', metrics=['accuracy'])
+    model.summary()
+
+    return model
+
+
+# %%
+mail('start getting train data')
+x1_train, x1_val, x2_train, x2_val, x3_train, x3_val, y_train, y_val, creative_id_emb, ad_id_emb, product_id_emb = get_train_val()
+mail('get train data done.')
+
+model = get_gender_model(creative_id_emb, ad_id_emb, product_id_emb)
+# %%
+# %%
+# Test the data format: (batch_size, sequence length)
+# x1 = np.array([1, 2, 3, 4]).reshape(1, -1)
+# x2 = np.array([1, 2, 3, 4]).reshape(1, -1)
+# model.predict([x1, x2])
+
+
+# %%
+checkpoint = ModelCheckpoint("tmp/gender_epoch_{epoch:02d}.hdf5", monitor='val_loss', verbose=1,
+                             save_best_only=False, mode='auto', period=1)
+# %%
+# model.fit(
+#     {'creative_id': x1_train, 'ad_id': x2_train},
+#     y_train,
+#     validation_data=([x1_val, x2_val], y_val),
+#     epochs=5,
+#     batch_size=256,
+#     callbacks=[checkpoint],
+# )
+
+# %%
+try:
+    mail('start train lstm')
+    model.fit(
+        {'creative_id': x1_train, 'ad_id': x2_train, 'product_id': x3_train},
+        y_train,
+        validation_data=([x1_val, x2_val, x3_val], y_val),
+        epochs=3,
+        batch_size=256,
+        callbacks=[checkpoint],
+    )
+    mail('train gender lstm done!!!')
+except Exception as e:
+    e = str(e)
+    mail('train lstm failed!!! ' + e)
+
+
+# %%
+# model.load_weights('tmp\gender_epoch_01.hdf5')
+
+
+# # %%
+# if debug:
+#     sequences = tokenizer.texts_to_sequences(
+#         creative_id_seq[900000:])
+# else:
+#     sequences = tokenizer.texts_to_sequences(
+#         creative_id_seq[900000:])
+
+# X_test = pad_sequences(sequences, maxlen=LEN_creative_id)
+# # %%
+# y_pred = model.predict(X_test, batch_size=4096)
+
+# y_pred = np.where(y_pred > 0.5, 1, 0)
+# y_pred = y_pred.flatten()
+
+# # %%
+# y_pred = y_pred+1
+# # %%
+# res = pd.DataFrame({'predicted_gender': y_pred})
+# res.to_csv(
+#     'data/ans/lstm_gender.csv', header=True, columns=['predicted_gender'], index=False)
+
+
+# # %%
+# mail('predict lstm gender done')
+
+# %%
diff --git a/test.py b/test.py
new file mode 100644
index 0000000..bca50c0
--- /dev/null
+++ b/test.py
@@ -0,0 +1,98 @@
+# %%
+import random
+import unittest
+
+from transformers import is_torch_available
+
+import transformers
+from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
+import torch
+
+import numpy as np
+import pandas as pd
+import seaborn as sns
+from pylab import rcParams
+import matplotlib.pyplot as plt
+from matplotlib import rc
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import confusion_matrix, classification_report
+from collections import defaultdict
+from textwrap import wrap
+
+from torch import nn, optim
+from torch.utils.data import Dataset, DataLoader
+import torch.nn.functional as F
+
+import pandas as pd
+
+if is_torch_available():
+    from transformers import (
+        BertConfig,
+        BertModel,
+        BertForMaskedLM,
+        BertForNextSentencePrediction,
+        BertForPreTraining,
+        BertForQuestionAnswering,
+        BertForSequenceClassification,
+        BertForTokenClassification,
+        BertForMultipleChoice,
+    )
+    from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_LIST
+
+
+# %%
+PRE_TRAINED_MODEL_NAME = 'bert-base-cased'
+EPOCHS = 10
+
+
+# %%
+creative_id_seq = []
+cnt = 0
+with open('word2vec/userid_creative_ids.txt', 'r') as f:
+    for text in f:
+        creative_id_seq.append(text.strip())
+        cnt += 1
+        if cnt == 90:
+            break
+with open('tmp/tmp.txt', 'w')as f:
+    f.write('[PAD]\n[UNK]\n[CLS]\n[SEP]\n')
+    s = set()
+    for seq in creative_id_seq:
+        seq = seq.split(' ')
+        s = s | set(seq)
+    for e in s:
+        f.write(str(e)+'\n')
+
+
+# %%
+user_train = pd.read_csv(
+    'data/train_preliminary/user.csv').sort_values(['user_id'], ascending=(True,))
+Y_gender = user_train['gender'].values
+Y_age = user_train['age'].values
+Y_gender = Y_gender - 1
+Y_age = Y_age - 1
+# Y_age = to_categorical(Y_age)
+
+
+# %%
+tokenizer = BertTokenizer('tmp/tmp.txt')
+print(tokenizer.get_vocab())
+sample_txt = '456 1 23 456 89 89'
+# tokenizer.tokenize(sample_txt)
+
+
+# %%
+
+encoding = tokenizer.encode_plus(
+    sample_txt,
+    max_length=32,
+    add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
+    return_token_type_ids=False,
+    pad_to_max_length=True,
+    return_attention_mask=True,
+    return_tensors='pt',  # Return PyTorch tensors
+)
+# encoding.keys()
+# encoding['input_ids']
+# encoding['attention_mask']
+# tokenizer.convert_ids_to_tokens(encoding['input_ids'][0])
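
Reviewer note, not part of the patch: the core change above replaces the three stacked unidirectional LSTMs per input with a configurable number (--num_lstm) of Bidirectional LSTM layers followed by GlobalMaxPooling1D, shares one get_embedding helper across creative_id / ad_id / product_id, and switches the gender head to a 2-way softmax with categorical cross-entropy. The snippet below is a minimal, self-contained sketch of that tower pattern under toy assumptions: the vocabulary size of 1000 and the random embedding matrices stand in for the word2vec matrices produced by get_embedding, the build_tower helper is hypothetical, and mask_zero=True from the patch is omitted for simplicity.

# Minimal sketch of the new per-feature tower: Embedding -> stacked BiLSTM -> max pooling.
import numpy as np
from tensorflow.keras import layers, Model
from tensorflow.keras.layers import (Input, LSTM, Bidirectional, Embedding,
                                     Dense, concatenate)


def build_tower(name, vocab_size, emb_matrix, num_lstm=1, lstm_units=256):
    # One input branch: frozen pretrained embedding, num_lstm BiLSTM layers, max pooling over time.
    inp = Input(shape=(None,), name=name)
    x = Embedding(input_dim=vocab_size,
                  output_dim=emb_matrix.shape[1],
                  weights=[emb_matrix],
                  trainable=False)(inp)      # mirrors the --not_train_embedding flag
    for _ in range(num_lstm):                # mirrors the --num_lstm flag
        x = Bidirectional(LSTM(lstm_units, return_sequences=True))(x)
    x = layers.GlobalMaxPooling1D()(x)
    return inp, x


# Toy vocabulary sizes and random matrices stand in for the word2vec matrices.
towers = [build_tower(n, 1000, np.random.randn(1000, 128), num_lstm=3)
          for n in ('creative_id', 'ad_id', 'product_id')]
inputs, features = zip(*towers)
output_y = Dense(2, activation='softmax')(concatenate(list(features)))
model = Model(list(inputs), output_y)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()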