diff --git a/Transformer_keras_new.py b/Transformer_keras_new.py
new file mode 100644
index 0000000..e759091
--- /dev/null
+++ b/Transformer_keras_new.py
@@ -0,0 +1,668 @@
+# %%
+# Train transformer models on the word-embedding files
+import os
+import argparse
+import numpy as np
+import pandas as pd
+from tqdm import tqdm
+import tensorflow as tf
+import tensorflow.keras.backend as K
+from tensorflow import keras
+from tensorflow.keras import layers
+from tensorflow.keras import losses
+from tensorflow.keras import optimizers
+from tensorflow.keras.callbacks import Callback, ModelCheckpoint, LearningRateScheduler
+from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Dropout, Concatenate, Bidirectional
+from tensorflow.keras.models import Model, Sequential
+from tensorflow.keras.preprocessing.sequence import pad_sequences
+from tensorflow.keras.preprocessing.text import Tokenizer
+from tensorflow.keras.utils import to_categorical
+from gensim.models import Word2Vec, KeyedVectors
+from mymail import mail
+
+from layers import PositionEncoding
+from layers import MultiHeadAttention, PositionWiseFeedForward
+from layers import Add, LayerNormalization
+
+tf.config.experimental_run_functions_eagerly(True)
+
+os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+
+'''
+python Transformer_keras_new.py --load_from_npy --batch_size 256 --epoch 5 --num_transformer 1 --head_attention 1 --num_lstm 1 --train_examples 100000
+'''
+
+# %%
+parser = argparse.ArgumentParser()
+parser.add_argument('--load_from_npy', action='store_true',
+                    help='load the cached training data from npy files',
+                    default=False)
+parser.add_argument('--not_train_embedding', action='store_false',
+                    help='freeze the embedding layers instead of fine-tuning them',
+                    default=True)
+parser.add_argument('--batch_size', type=int,
+                    help='batch size',
+                    default=256)
+parser.add_argument('--epoch', type=int,
+                    help='number of epochs',
+                    default=5)
+parser.add_argument('--num_transformer', type=int,
+                    help='number of transformer blocks',
+                    default=1)
+parser.add_argument('--head_attention', type=int,
+                    help='number of transformer attention heads',
+                    default=1)
+parser.add_argument('--num_lstm', type=int,
+                    help='number of LSTM layers',
+                    default=1)
+parser.add_argument('--train_examples', type=int,
+                    help='number of training examples (training split only, without validation); set to e.g. 1000 when debugging',
+                    default=810000)
+parser.add_argument('--val_examples', type=int,
+                    help='number of validation examples; set to e.g. 1000 when debugging',
+                    default=90000)
+args = parser.parse_args()
+# %%
+
+
+class Transformer(tf.keras.layers.Layer):
+
+    def __init__(self, vocab_size, model_dim,
+                 n_heads=8, encoder_stack=6, decoder_stack=6, feed_forward_size=2048, dropout_rate=0.1, **kwargs):
+        self._vocab_size = vocab_size
+        self._model_dim = model_dim
+        self._n_heads = n_heads
+        self._encoder_stack = encoder_stack
+        self._decoder_stack = decoder_stack
+        self._feed_forward_size = feed_forward_size
+        self._dropout_rate = dropout_rate
+        super(Transformer, self).__init__(**kwargs)
+
+    def build(self, input_shape):
+        # A single embedding matrix, shared by the encoder, the decoder and
+        # the pre-softmax projection (weight tying).
+        self.embeddings = self.add_weight(
+            shape=(self._vocab_size, self._model_dim),
+            initializer='glorot_uniform',
+            trainable=True,
+            name="embeddings")
+        super(Transformer, self).build(input_shape)
+
+    def encoder(self, inputs):
+        if K.dtype(inputs) != 'int32':
+            inputs = K.cast(inputs, 'int32')
+
+        masks = K.equal(inputs, 0)
+        # Embeddings
+        embeddings = K.gather(self.embeddings, inputs)
+        embeddings *= self._model_dim ** 0.5  # Scale
+        # Position Encodings
+        position_encodings = PositionEncoding(self._model_dim)(embeddings)
+        # Embeddings + position-encodings
+        encodings = embeddings + position_encodings
+        # Dropout
+        encodings = K.dropout(encodings, self._dropout_rate)
+
+        for i in range(self._encoder_stack):
+            # Multi-head-Attention
+            attention = MultiHeadAttention(
+                self._n_heads, self._model_dim // self._n_heads)
+            attention_input = [encodings, encodings, encodings, masks]
+            attention_out = attention(attention_input)
+            # Add & Norm
+            attention_out += encodings
+            attention_out = LayerNormalization()(attention_out)
+            # Feed-Forward
+            ff = PositionWiseFeedForward(
+                self._model_dim, self._feed_forward_size)
+            ff_out = ff(attention_out)
+            # Add & Norm
+            ff_out += attention_out
+            encodings = LayerNormalization()(ff_out)
+
+        return encodings, masks
+
+    def decoder(self, inputs):
+        decoder_inputs, encoder_encodings, encoder_masks = inputs
+        if K.dtype(decoder_inputs) != 'int32':
+            decoder_inputs = K.cast(decoder_inputs, 'int32')
+
+        decoder_masks = K.equal(decoder_inputs, 0)
+        # Embeddings
+        embeddings = K.gather(self.embeddings, decoder_inputs)
+        embeddings *= self._model_dim ** 0.5  # Scale
+        # Position Encodings
+        position_encodings = PositionEncoding(self._model_dim)(embeddings)
+        # Embeddings + position-encodings
+        encodings = embeddings + position_encodings
+        # Dropout
+        encodings = K.dropout(encodings, self._dropout_rate)
+
+        for i in range(self._decoder_stack):
+            # Masked-Multi-head-Attention
+            masked_attention = MultiHeadAttention(
+                self._n_heads, self._model_dim // self._n_heads, future=True)
+            masked_attention_input = [encodings,
+                                      encodings, encodings, decoder_masks]
+            masked_attention_out = masked_attention(masked_attention_input)
+            # Add & Norm
+            masked_attention_out += encodings
+            masked_attention_out = LayerNormalization()(masked_attention_out)
+
+            # Multi-head-Attention
+            attention = MultiHeadAttention(
+                self._n_heads, self._model_dim // self._n_heads)
+            attention_input = [masked_attention_out,
+                               encoder_encodings, encoder_encodings, encoder_masks]
+            attention_out = attention(attention_input)
+            # Add & Norm
+            attention_out += masked_attention_out
+            attention_out = LayerNormalization()(attention_out)
+
+            # Feed-Forward
+            ff = PositionWiseFeedForward(
+                self._model_dim, self._feed_forward_size)
+            ff_out = ff(attention_out)
+            # Add & Norm
+            ff_out += attention_out
+            encodings = LayerNormalization()(ff_out)
+
+        # The pre-softmax projection shares its weights with the embeddings
+        linear_projection = K.dot(encodings, K.transpose(self.embeddings))
+        outputs = K.softmax(linear_projection)
+        return outputs
+
+    def call(self, inputs):
+        encoder_inputs, decoder_inputs = inputs
+        encoder_encodings, encoder_masks = self.encoder(encoder_inputs)
+        encoder_outputs = self.decoder(
+            [decoder_inputs, encoder_encodings, encoder_masks])
+        # Return the encoder representations, used here as sequence features,
+        # rather than the decoder distribution over the vocabulary.
+        # return encoder_outputs
+        return encoder_encodings
+
+    def compute_output_shape(self, input_shape):
+        # call() returns the encoder representations, so the last dimension
+        # is model_dim (it would be vocab_size for the decoder output).
+        return (input_shape[0][0], input_shape[0][1], self._model_dim)
+
+
+class Noam(Callback):
+
+    def __init__(self, model_dim, step_num=0, warmup_steps=4000, verbose=False, **kwargs):
+        self._model_dim = model_dim
+        self._step_num = step_num
+        self._warmup_steps = warmup_steps
+        self.verbose = verbose
+        super(Noam, self).__init__(**kwargs)
+
+    def on_train_begin(self, logs=None):
+        logs = logs or {}
+        init_lr = self._model_dim ** -.5 * self._warmup_steps ** -1.5
+        K.set_value(self.model.optimizer.lr, init_lr)
+
+    def on_batch_end(self, batch, logs=None):
+        logs = logs or {}
+        self._step_num += 1
+        lrate = self._model_dim ** -.5 * \
+            K.minimum(self._step_num ** -.5, self._step_num *
+                      self._warmup_steps ** -1.5)
+        K.set_value(self.model.optimizer.lr, lrate)
+
+    def on_epoch_begin(self, epoch, logs=None):
+        if self.verbose:
+            lrate = K.get_value(self.model.optimizer.lr)
+            print(f"epoch {epoch} lr: {lrate}")
+
+    def on_epoch_end(self, epoch, logs=None):
+        logs = logs or {}
+        logs['lr'] = K.get_value(self.model.optimizer.lr)
+
+
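+# Usage sketch for the Noam schedule (assumption: illustrative only; the
+# training run below uses a fixed Adam learning rate instead):
+#
+#   model.fit(x, y, epochs=args.epoch, batch_size=args.batch_size,
+#             callbacks=[Noam(model_dim=128, warmup_steps=4000, verbose=True)])
+
+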
+def label_smoothing(inputs, epsilon=0.1):
+    output_dim = inputs.shape[-1]
+    smooth_label = (1 - epsilon) * inputs + (epsilon / output_dim)
+    return smooth_label
+
+
+# %%
+NUM_creative_id = 3412772
+NUM_ad_id = 3027360
+NUM_product_id = 39057
+NUM_advertiser_id = 57870
+NUM_industry = 332
+NUM_product_category = 18
+
+vocab_size = 5000
+max_seq_len = 100
+model_dim = 128
+
+encoder_inputs = Input(shape=(max_seq_len,), name='encoder_inputs')
+decoder_inputs = Input(shape=(max_seq_len,), name='decoder_inputs')
+
+outputs = Transformer(NUM_creative_id, model_dim)([encoder_inputs, decoder_inputs])
+
+model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=outputs)
+
+model.summary()
+
+
+# %%
+
+LEN_creative_id = 100
+LEN_ad_id = 100
+LEN_product_id = 100
+LEN_advertiser_id = 100
+LEN_industry = 100
+LEN_product_category = 100
+
+# vocab_size = NUM_creative_id
+maxlen = 100
+
+
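+# %%
+# NOTE: TokenAndPositionEmbedding and TransformerBlock are used by the model
+# builders below but are neither defined nor imported anywhere in this diff.
+# A minimal sketch is given here, adapted from the standard Keras "text
+# classification with Transformer" example; the extra `weights` argument,
+# which seeds the token embedding from the pre-trained word2vec matrix, is an
+# assumption based on how the layers are called below.
+class TokenAndPositionEmbedding(layers.Layer):
+
+    def __init__(self, maxlen, vocab_size, embed_dim, weights=None, **kwargs):
+        super(TokenAndPositionEmbedding, self).__init__(**kwargs)
+        if weights is not None:
+            # Initialize from the pre-trained matrix (shape: vocab x dim)
+            self.token_emb = layers.Embedding(
+                input_dim=vocab_size, output_dim=embed_dim, weights=[weights])
+        else:
+            self.token_emb = layers.Embedding(
+                input_dim=vocab_size, output_dim=embed_dim)
+        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)
+
+    def call(self, x):
+        seq_len = tf.shape(x)[-1]
+        positions = tf.range(start=0, limit=seq_len, delta=1)
+        positions = self.pos_emb(positions)
+        x = self.token_emb(x)
+        # Learned position embeddings broadcast over the batch dimension
+        return x + positions
+
+
+class TransformerBlock(layers.Layer):
+
+    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
+        super(TransformerBlock, self).__init__(**kwargs)
+        # Reuse this repo's own attention layer (imported from layers.py);
+        # masking=False because no mask tensor is passed in below.
+        self.att = MultiHeadAttention(
+            num_heads, embed_dim // num_heads, masking=False)
+        self.ffn = Sequential(
+            [Dense(ff_dim, activation='relu'), Dense(embed_dim)])
+        self.layernorm1 = LayerNormalization()
+        self.layernorm2 = LayerNormalization()
+        self.dropout1 = Dropout(rate)
+        self.dropout2 = Dropout(rate)
+
+    def call(self, inputs, training=False):
+        # Self-attention, then the position-wise feed-forward network,
+        # each followed by dropout and a residual + layer-norm step.
+        attn_output = self.att([inputs, inputs, inputs])
+        attn_output = self.dropout1(attn_output, training=training)
+        out1 = self.layernorm1(inputs + attn_output)
+        ffn_output = self.ffn(out1)
+        ffn_output = self.dropout2(ffn_output, training=training)
+        return self.layernorm2(out1 + ffn_output)
+
+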
+def get_model(creative_id_emb, ad_id_emb, product_id_emb):
+    embed_dim = 128  # Embedding size for each token
+    num_heads = 1  # Number of attention heads
+    ff_dim = 256  # Hidden layer size in feed forward network inside transformer
+
+    # First input; shape: (sequence length,)
+    input_creative_id = Input(shape=(None,), name='creative_id')
+    x1 = TokenAndPositionEmbedding(
+        maxlen, NUM_creative_id, embed_dim, creative_id_emb)(input_creative_id)
+    for _ in range(args.num_transformer):
+        x1 = TransformerBlock(embed_dim, num_heads, ff_dim)(x1)
+
+    for _ in range(args.num_lstm):
+        x1 = Bidirectional(LSTM(256, return_sequences=True))(x1)
+    x1 = layers.GlobalMaxPooling1D()(x1)
+
+    # Second input
+    input_ad_id = Input(shape=(None,), name='ad_id')
+
+    x2 = TokenAndPositionEmbedding(
+        maxlen, NUM_ad_id, embed_dim, ad_id_emb)(input_ad_id)
+    for _ in range(args.num_transformer):
+        x2 = TransformerBlock(embed_dim, num_heads, ff_dim)(x2)
+    for _ in range(args.num_lstm):
+        x2 = Bidirectional(LSTM(256, return_sequences=True))(x2)
+        # x2 = Bidirectional(LSTM(256, return_sequences=False))(x2)
+    x2 = layers.GlobalMaxPooling1D()(x2)
+
+    # Third input
+    input_product_id = Input(shape=(None,), name='product_id')
+
+    x3 = TokenAndPositionEmbedding(
+        maxlen, NUM_product_id, embed_dim, product_id_emb)(input_product_id)
+    for _ in range(args.num_transformer):
+        x3 = TransformerBlock(embed_dim, num_heads, ff_dim)(x3)
+    for _ in range(args.num_lstm):
+        x3 = Bidirectional(LSTM(256, return_sequences=True))(x3)
+        # x3 = Bidirectional(LSTM(256, return_sequences=False))(x3)
+    x3 = layers.GlobalMaxPooling1D()(x3)
+
+    # Concatenate x1, x2 and x3
+    x = Concatenate(axis=1)([x1, x2, x3])
+    # x = x1 + x2 + x3
+    x = Dense(20)(x)
+    # x = Dropout(0.1)(x)
+    output_y = Dense(10, activation='softmax')(x)
+
+    model = Model([input_creative_id, input_ad_id, input_product_id], output_y)
+    # model = Model(input_creative_id, outputs)
+    model.compile(loss='categorical_crossentropy',
+                  optimizer='adam', metrics=['accuracy'])
+    model.summary()
+
+    return model
+
+
+# %%
+def get_model_head_concat(DATA):
+    embed_dim = 128  # Embedding size for each token
+    num_heads = args.head_attention  # Number of attention heads
+    ff_dim = 256  # Hidden layer size in feed forward network inside transformer
+
+    # Six inputs, each of shape (sequence length,)
+    input_creative_id = Input(shape=(None,), name='creative_id')
+    x1 = TokenAndPositionEmbedding(
+        maxlen, NUM_creative_id+1, embed_dim, DATA['creative_id_emb'])(input_creative_id)
+
+    input_ad_id = Input(shape=(None,), name='ad_id')
+    x2 = TokenAndPositionEmbedding(
+        maxlen, NUM_ad_id+1, embed_dim, DATA['ad_id_emb'])(input_ad_id)
+
+    input_product_id = Input(shape=(None,), name='product_id')
+    x3 = TokenAndPositionEmbedding(
+        maxlen, NUM_product_id+1, embed_dim, DATA['product_id_emb'])(input_product_id)
+
+    input_advertiser_id = Input(shape=(None,), name='advertiser_id')
+    x4 = TokenAndPositionEmbedding(
+        maxlen, NUM_advertiser_id+1, embed_dim, DATA['advertiser_id_emb'])(input_advertiser_id)
+
+    input_industry = Input(shape=(None,), name='industry')
+    x5 = TokenAndPositionEmbedding(
+        maxlen, NUM_industry+1, embed_dim, DATA['industry_emb'])(input_industry)
+
+    input_product_category = Input(shape=(None,), name='product_category')
+    x6 = TokenAndPositionEmbedding(
+        maxlen, NUM_product_category+1, embed_dim, DATA['product_category_emb'])(input_product_category)
+
+    # Concatenate along the sequence axis
+    # x = x1 + x2 + x3
+    x = layers.Concatenate(axis=1)([x1, x2, x3, x4, x5, x6])
+
+    for _ in range(args.num_transformer):
+        x = TransformerBlock(embed_dim, num_heads, ff_dim)(x)
+
+    for _ in range(args.num_lstm):
+        x = Bidirectional(LSTM(256, return_sequences=True))(x)
+    x = layers.GlobalMaxPooling1D()(x)
+
+    output_gender = Dense(2, activation='softmax', name='gender')(x)
+    output_age = Dense(10, activation='softmax', name='age')(x)
+
+    model = Model(
+        [
+            input_creative_id,
+            input_ad_id,
+            input_product_id,
+            input_advertiser_id,
+            input_industry,
+            input_product_category
+        ],
+        [
+            output_gender,
+            output_age
+        ]
+    )
+    model.compile(
+        optimizer=optimizers.Adam(1e-4),
+        loss={'gender': losses.CategoricalCrossentropy(from_logits=False),
+              'age': losses.CategoricalCrossentropy(from_logits=False)},
+        loss_weights=[0.4, 0.6],
+        metrics=['accuracy'])
+    model.summary()
+
+    return model
+
+
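+# Design note: get_model() fuses the three branches *after* pooling, i.e.
+# Concatenate(axis=1) on three (batch, 512) vectors gives (batch, 1536),
+# whereas get_model_head_concat() fuses the six embedded sequences along the
+# sequence axis *before* the transformer, so six (batch, 100, 128) tensors
+# become (batch, 600, 128) and attention can mix information across features.
+
+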
+# %%
+
+
+def get_train_val():
+
+    # Load the pre-trained word vectors
+    def get_embedding(feature_name, tokenizer):
+        path = f'word2vec_new/{feature_name}.kv'
+        wv = KeyedVectors.load(path, mmap='r')
+        feature_tokens = list(wv.vocab.keys())
+        embedding_dim = 128
+        embedding_matrix = np.random.randn(
+            len(feature_tokens)+1, embedding_dim)
+        for feature in feature_tokens:
+            embedding_vector = wv[feature]
+            if embedding_vector is not None:
+                index = tokenizer.texts_to_sequences([feature])[0][0]
+                embedding_matrix[index] = embedding_vector
+        return embedding_matrix
+
+    # Convert a sequence file into padded arrays
+    def get_train(feature_name, vocab_size, len_feature):
+        f = open(f'word2vec_new/{feature_name}.txt')
+        tokenizer = Tokenizer(num_words=vocab_size)
+        tokenizer.fit_on_texts(f)
+        f.close()
+
+        feature_seq = []
+        with open(f'word2vec_new/{feature_name}.txt') as f:
+            for text in f:
+                feature_seq.append(text.strip())
+
+        sequences = tokenizer.texts_to_sequences(feature_seq[:900000])
+        X_train = pad_sequences(
+            sequences, maxlen=len_feature, padding='post')
+        return X_train, tokenizer
+
+    # Build the training labels: age and gender
+    DATA = {}
+
+    user_train = pd.read_csv(
+        'data/train_preliminary/user.csv').sort_values(['user_id'], ascending=(True,))
+    Y_gender = user_train['gender'].values
+    Y_age = user_train['age'].values
+    # Shift the 1-based labels to 0-based before one-hot encoding
+    Y_gender = Y_gender - 1
+    Y_age = Y_age - 1
+    Y_age = to_categorical(Y_age)
+    Y_gender = to_categorical(Y_gender)
+
+    num_examples = Y_age.shape[0]
+    train_examples = int(num_examples * 0.9)
+
+    DATA['Y_gender_train'] = Y_gender[:train_examples]
+    DATA['Y_gender_val'] = Y_gender[train_examples:]
+    DATA['Y_age_train'] = Y_age[:train_examples]
+    DATA['Y_age_val'] = Y_age[train_examples:]
+
+    # First input
+    print('Building creative_id features')
+    X1_train, tokenizer = get_train(
+        'creative_id', NUM_creative_id+1, LEN_creative_id)  # +1 reserves an index for unknown creative_id
+    creative_id_emb = get_embedding('creative_id', tokenizer)
+
+    DATA['X1_train'] = X1_train[:train_examples]
+    DATA['X1_val'] = X1_train[train_examples:]
+    DATA['creative_id_emb'] = creative_id_emb
+
+    # Second input
+    print('Building ad_id features')
+    X2_train, tokenizer = get_train(
+        'ad_id', NUM_ad_id+1, LEN_ad_id)
+    ad_id_emb = get_embedding('ad_id', tokenizer)
+
+    DATA['X2_train'] = X2_train[:train_examples]
+    DATA['X2_val'] = X2_train[train_examples:]
+    DATA['ad_id_emb'] = ad_id_emb
+
+    # Third input
+    print('Building product_id features')
+    X3_train, tokenizer = get_train(
+        'product_id', NUM_product_id+1, LEN_product_id)
+    product_id_emb = get_embedding('product_id', tokenizer)
+
+    DATA['X3_train'] = X3_train[:train_examples]
+    DATA['X3_val'] = X3_train[train_examples:]
+    DATA['product_id_emb'] = product_id_emb
+
+    # Fourth input
+    print('Building advertiser_id features')
+    X4_train, tokenizer = get_train(
+        'advertiser_id', NUM_advertiser_id+1, LEN_advertiser_id)
+    advertiser_id_emb = get_embedding('advertiser_id', tokenizer)
+
+    DATA['X4_train'] = X4_train[:train_examples]
+    DATA['X4_val'] = X4_train[train_examples:]
+    DATA['advertiser_id_emb'] = advertiser_id_emb
+
+    # Fifth input
+    print('Building industry features')
+    X5_train, tokenizer = get_train(
+        'industry', NUM_industry+1, LEN_industry)
+    industry_emb = get_embedding('industry', tokenizer)
+
+    DATA['X5_train'] = X5_train[:train_examples]
+    DATA['X5_val'] = X5_train[train_examples:]
+    DATA['industry_emb'] = industry_emb
+
+    # Sixth input
+    print('Building product_category features')
+    X6_train, tokenizer = get_train(
+        'product_category', NUM_product_category+1, LEN_product_category)
+    product_category_emb = get_embedding('product_category', tokenizer)
+
+    DATA['X6_train'] = X6_train[:train_examples]
+    DATA['X6_val'] = X6_train[train_examples:]
+    DATA['product_category_emb'] = product_category_emb
+
+    return DATA
+
+
+# %%
+if not args.load_from_npy:
+    mail('start getting train data')
+    print('Extracting training data from the csv files into arrays; this takes roughly ten to twenty minutes')
+    DATA = get_train_val()
+    mail('get train data done.')
+
+    # Cache the training data as npy files
+    dirs = 'tmp/'
+    if not os.path.exists(dirs):
+        os.makedirs(dirs)
+
+    def save_npy(datas, name):
+        for i, data in enumerate(datas):
+            np.save(f'tmp/{name}_{i}.npy', data)
+            print(f'saving tmp/{name}_{i}.npy')
+
+    inputs = [
+        DATA['X1_train'], DATA['X1_val'],
+        DATA['X2_train'], DATA['X2_val'],
+        DATA['X3_train'], DATA['X3_val'],
+        DATA['X4_train'], DATA['X4_val'],
+        DATA['X5_train'], DATA['X5_val'],
+        DATA['X6_train'], DATA['X6_val'],
+    ]
+    outputs_gender = [DATA['Y_gender_train'], DATA['Y_gender_val']]
+    outputs_age = [DATA['Y_age_train'], DATA['Y_age_val']]
+    embeddings = [
+        DATA['creative_id_emb'],
+        DATA['ad_id_emb'],
+        DATA['product_id_emb'],
+        DATA['advertiser_id_emb'],
+        DATA['industry_emb'],
+        DATA['product_category_emb'],
+    ]
+    save_npy(inputs, 'inputs')
+    save_npy(outputs_gender, 'gender')
+    save_npy(outputs_age, 'age')
+    save_npy(embeddings, 'embeddings')
+else:
+    DATA = {}
+    DATA['X1_train'] = np.load('tmp/inputs_0.npy', allow_pickle=True)
+    DATA['X1_val'] = np.load('tmp/inputs_1.npy', allow_pickle=True)
+    DATA['X2_train'] = np.load('tmp/inputs_2.npy', allow_pickle=True)
+    DATA['X2_val'] = np.load('tmp/inputs_3.npy', allow_pickle=True)
+    DATA['X3_train'] = np.load('tmp/inputs_4.npy', allow_pickle=True)
+    DATA['X3_val'] = np.load('tmp/inputs_5.npy', allow_pickle=True)
+    DATA['X4_train'] = np.load('tmp/inputs_6.npy', allow_pickle=True)
+    DATA['X4_val'] = np.load('tmp/inputs_7.npy', allow_pickle=True)
+    DATA['X5_train'] = np.load('tmp/inputs_8.npy', allow_pickle=True)
+    DATA['X5_val'] = np.load('tmp/inputs_9.npy', allow_pickle=True)
+    DATA['X6_train'] = np.load('tmp/inputs_10.npy', allow_pickle=True)
+    DATA['X6_val'] = np.load('tmp/inputs_11.npy', allow_pickle=True)
+    DATA['Y_gender_train'] = np.load('tmp/gender_0.npy', allow_pickle=True)
+    DATA['Y_gender_val'] = np.load('tmp/gender_1.npy', allow_pickle=True)
+    DATA['Y_age_train'] = np.load('tmp/age_0.npy', allow_pickle=True)
+    DATA['Y_age_val'] = np.load('tmp/age_1.npy', allow_pickle=True)
+    DATA['creative_id_emb'] = np.load(
+        'tmp/embeddings_0.npy', allow_pickle=True)
+    DATA['ad_id_emb'] = np.load(
+        'tmp/embeddings_1.npy', allow_pickle=True)
+    DATA['product_id_emb'] = np.load(
+        'tmp/embeddings_2.npy', allow_pickle=True)
+    DATA['advertiser_id_emb'] = np.load(
+        'tmp/embeddings_3.npy', allow_pickle=True)
+    DATA['industry_emb'] = np.load(
+        'tmp/embeddings_4.npy', allow_pickle=True)
+    DATA['product_category_emb'] = np.load(
+        'tmp/embeddings_5.npy', allow_pickle=True)
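+
+# Quick shape sanity check (assumption: illustrative only)
+for key in ('X1_train', 'Y_age_train', 'creative_id_emb'):
+    print(key, DATA[key].shape)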
+
+
+# %%
+# model = get_age_model(creative_id_emb, ad_id_emb, product_id_emb)
+model = get_model_head_concat(DATA)
+# %%
+# Test input format: (batch_size, sequence length)
+# x1 = np.array([1, 2, 3, 4]).reshape(1, -1)
+# x2 = np.array([1, 2, 3, 4]).reshape(1, -1)
+# model.predict([x1, x2])
+
+
+# %%
+checkpoint = ModelCheckpoint("tmp/transformer_epoch_{epoch:02d}.hdf5", save_weights_only=True, monitor='val_loss', verbose=1,
+                             save_best_only=False, mode='auto', period=1)
+# %%
+try:
+    train_examples = args.train_examples
+    val_examples = args.val_examples
+    mail('start train')
+    model.fit(
+        {
+            'creative_id': DATA['X1_train'][:train_examples],
+            'ad_id': DATA['X2_train'][:train_examples],
+            'product_id': DATA['X3_train'][:train_examples],
+            'advertiser_id': DATA['X4_train'][:train_examples],
+            'industry': DATA['X5_train'][:train_examples],
+            'product_category': DATA['X6_train'][:train_examples]
+        },
+        {
+            'gender': DATA['Y_gender_train'][:train_examples],
+            'age': DATA['Y_age_train'][:train_examples],
+        },
+        validation_data=(
+            {
+                'creative_id': DATA['X1_val'][:val_examples],
+                'ad_id': DATA['X2_val'][:val_examples],
+                'product_id': DATA['X3_val'][:val_examples],
+                'advertiser_id': DATA['X4_val'][:val_examples],
+                'industry': DATA['X5_val'][:val_examples],
+                'product_category': DATA['X6_val'][:val_examples]
+            },
+            {
+                'gender': DATA['Y_gender_val'][:val_examples],
+                'age': DATA['Y_age_val'][:val_examples],
+            },
+        ),
+        epochs=args.epoch,
+        batch_size=args.batch_size,
+        callbacks=[checkpoint],
+    )
+    mail('train done!!!')
+except Exception as e:
+    mail('train failed!!! ' + str(e))
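+
+# %%
+# Inference sketch (assumption: illustrative only; X1_test ... X6_test are
+# hypothetical arrays that would have to be tokenized and padded with the
+# same tokenizers used for training):
+#
+# model.load_weights('tmp/transformer_epoch_05.hdf5')
+# y_gender_pred, y_age_pred = model.predict(
+#     {'creative_id': X1_test, 'ad_id': X2_test, 'product_id': X3_test,
+#      'advertiser_id': X4_test, 'industry': X5_test,
+#      'product_category': X6_test},
+#     batch_size=1024)
+# predicted_age = y_age_pred.argmax(axis=-1) + 1       # back to 1-based labels
+# predicted_gender = y_gender_pred.argmax(axis=-1) + 1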
+
+# %%
+# model.load_weights('tmp/gender_epoch_01.hdf5')
+
+
+# # %%
+# sequences = tokenizer.texts_to_sequences(
+#     creative_id_seq[900000:])
+
+# X_test = pad_sequences(sequences, maxlen=LEN_creative_id)
+# # %%
+# y_pred = model.predict(X_test, batch_size=4096)
+
+# y_pred = np.where(y_pred > 0.5, 1, 0)
+# y_pred = y_pred.flatten()
+
+# # %%
+# y_pred = y_pred+1
+# # %%
+# res = pd.DataFrame({'predicted_gender': y_pred})
+# res.to_csv(
+#     'data/ans/lstm_gender.csv', header=True, columns=['predicted_gender'], index=False)
+
+
+# # %%
+# mail('predict lstm gender done')
+
+# %%
diff --git a/layers.py b/layers.py
new file mode 100644
index 0000000..08e1227
--- /dev/null
+++ b/layers.py
@@ -0,0 +1,332 @@
+'''
+@Description:
+@version:
+@License: MIT
+@Author: Wang Yao
+@Date: 2020-03-22 17:48:05
+@LastEditors: Wang Yao
+@LastEditTime: 2020-03-26 18:35:10
+'''
+from __future__ import print_function
+
+import os
+import numpy as np
+import tensorflow as tf
+import tensorflow.keras.backend as K
+from tensorflow.keras.layers import Layer
+
+
+class Embedding(Layer):
+
+    def __init__(self, vocab_size, model_dim, **kwargs):
+        self._vocab_size = vocab_size
+        self._model_dim = model_dim
+        super(Embedding, self).__init__(**kwargs)
+
+    def build(self, input_shape):
+        self.embeddings = self.add_weight(
+            shape=(self._vocab_size, self._model_dim),
+            initializer='glorot_uniform',
+            name="embeddings")
+        super(Embedding, self).build(input_shape)
+
+    def call(self, inputs):
+        if K.dtype(inputs) != 'int32':
+            inputs = K.cast(inputs, 'int32')
+        embeddings = K.gather(self.embeddings, inputs)
+        embeddings *= self._model_dim ** 0.5  # Scale
+        return embeddings
+
+    def compute_output_shape(self, input_shape):
+        return input_shape + (self._model_dim,)
+
+
+class PositionEncoding(Layer):
+
+    def __init__(self, model_dim, **kwargs):
+        self._model_dim = model_dim
+        super(PositionEncoding, self).__init__(**kwargs)
+
+    def call(self, inputs):
+        seq_length = inputs.shape[1]
+        position_encodings = np.zeros((seq_length, self._model_dim))
+        for pos in range(seq_length):
+            for i in range(self._model_dim):
+                position_encodings[pos, i] = pos / \
+                    np.power(10000, (i-i % 2) / self._model_dim)
+        position_encodings[:, 0::2] = np.sin(position_encodings[:, 0::2])  # 2i
+        position_encodings[:, 1::2] = np.cos(
+            position_encodings[:, 1::2])  # 2i+1
+        position_encodings = K.cast(position_encodings, 'float32')
+        return position_encodings
+
+    def compute_output_shape(self, input_shape):
+        return input_shape
+
+
+class ScaledDotProductAttention(Layer):
+
+    def __init__(self, masking=True, future=False, dropout_rate=0., **kwargs):
+        self._masking = masking
+        self._future = future
+        self._dropout_rate = dropout_rate
+        self._masking_num = -2**32+1
+        super(ScaledDotProductAttention, self).__init__(**kwargs)
+
+    def mask(self, inputs, masks):
+        # Push padded positions towards -inf so that softmax ignores them
+        masks = K.cast(masks, 'float32')
+        masks = K.tile(masks, [K.shape(inputs)[0] // K.shape(masks)[0], 1])
+        masks = K.expand_dims(masks, 1)
+        outputs = inputs + masks * self._masking_num
+        return outputs
+
+    def future_mask(self, inputs):
+        # Lower-triangular mask: position i may only attend to positions <= i
+        diag_vals = tf.ones_like(inputs[0, :, :])
+        tril = tf.linalg.LinearOperatorLowerTriangular(diag_vals).to_dense()
+        future_masks = tf.tile(tf.expand_dims(tril, 0), [
+                               tf.shape(inputs)[0], 1, 1])
+        paddings = tf.ones_like(future_masks) * self._masking_num
+        outputs = tf.where(tf.equal(future_masks, 0), paddings, inputs)
+        return outputs
+
+    def call(self, inputs):
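+        # When masking is enabled, inputs is [queries, keys, values, masks];
+        # otherwise [queries, keys, values]. Shapes: (batch, seq, dim).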
if self._masking: + assert len( + inputs) == 4, "inputs should be set [queries, keys, values, masks]." + queries, keys, values, masks = inputs + else: + assert len( + inputs) == 3, "inputs should be set [queries, keys, values]." + queries, keys, values = inputs + + if K.dtype(queries) != 'float32': + queries = K.cast(queries, 'float32') + if K.dtype(keys) != 'float32': + keys = K.cast(keys, 'float32') + if K.dtype(values) != 'float32': + values = K.cast(values, 'float32') + + matmul = K.batch_dot(queries, tf.transpose(keys, [0, 2, 1])) # MatMul + scaled_matmul = matmul / int(queries.shape[-1]) ** 0.5 # Scale + if self._masking: + scaled_matmul = self.mask(scaled_matmul, masks) # Mask(opt.) + + if self._future: + scaled_matmul = self.future_mask(scaled_matmul) + + softmax_out = K.softmax(scaled_matmul) # SoftMax + # Dropout + out = K.dropout(softmax_out, self._dropout_rate) + + outputs = K.batch_dot(out, values) + + return outputs + + def compute_output_shape(self, input_shape): + return input_shape + + +class MultiHeadAttention(Layer): + + def __init__(self, n_heads, head_dim, dropout_rate=.1, masking=True, future=False, trainable=True, **kwargs): + self._n_heads = n_heads + self._head_dim = head_dim + self._dropout_rate = dropout_rate + self._masking = masking + self._future = future + self._trainable = trainable + super(MultiHeadAttention, self).__init__(**kwargs) + + def build(self, input_shape): + self._weights_queries = self.add_weight( + shape=(input_shape[0][-1], self._n_heads * self._head_dim), + initializer='glorot_uniform', + trainable=self._trainable, + name='weights_queries') + self._weights_keys = self.add_weight( + shape=(input_shape[1][-1], self._n_heads * self._head_dim), + initializer='glorot_uniform', + trainable=self._trainable, + name='weights_keys') + self._weights_values = self.add_weight( + shape=(input_shape[2][-1], self._n_heads * self._head_dim), + initializer='glorot_uniform', + trainable=self._trainable, + name='weights_values') + super(MultiHeadAttention, self).build(input_shape) + + def call(self, inputs): + if self._masking: + assert len( + inputs) == 4, "inputs should be set [queries, keys, values, masks]." + queries, keys, values, masks = inputs + else: + assert len( + inputs) == 3, "inputs should be set [queries, keys, values]." 
+            queries, keys, values = inputs
+
+        queries_linear = K.dot(queries, self._weights_queries)
+        keys_linear = K.dot(keys, self._weights_keys)
+        values_linear = K.dot(values, self._weights_values)
+
+        # Split the projections into heads along the feature axis and stack
+        # the heads along the batch axis
+        queries_multi_heads = tf.concat(
+            tf.split(queries_linear, self._n_heads, axis=2), axis=0)
+        keys_multi_heads = tf.concat(
+            tf.split(keys_linear, self._n_heads, axis=2), axis=0)
+        values_multi_heads = tf.concat(
+            tf.split(values_linear, self._n_heads, axis=2), axis=0)
+
+        if self._masking:
+            att_inputs = [queries_multi_heads,
+                          keys_multi_heads, values_multi_heads, masks]
+        else:
+            att_inputs = [queries_multi_heads,
+                          keys_multi_heads, values_multi_heads]
+
+        attention = ScaledDotProductAttention(
+            masking=self._masking, future=self._future, dropout_rate=self._dropout_rate)
+        att_out = attention(att_inputs)
+
+        outputs = tf.concat(tf.split(att_out, self._n_heads, axis=0), axis=2)
+
+        return outputs
+
+    def compute_output_shape(self, input_shape):
+        return input_shape
+
+
+class PositionWiseFeedForward(Layer):
+
+    def __init__(self, model_dim, inner_dim, trainable=True, **kwargs):
+        self._model_dim = model_dim
+        self._inner_dim = inner_dim
+        self._trainable = trainable
+        super(PositionWiseFeedForward, self).__init__(**kwargs)
+
+    def build(self, input_shape):
+        self.weights_inner = self.add_weight(
+            shape=(input_shape[-1], self._inner_dim),
+            initializer='glorot_uniform',
+            trainable=self._trainable,
+            name="weights_inner")
+        self.weights_out = self.add_weight(
+            shape=(self._inner_dim, self._model_dim),
+            initializer='glorot_uniform',
+            trainable=self._trainable,
+            name="weights_out")
+        self.bias_inner = self.add_weight(
+            shape=(self._inner_dim,),
+            initializer='uniform',
+            trainable=self._trainable,
+            name="bias_inner")
+        self.bias_out = self.add_weight(
+            shape=(self._model_dim,),
+            initializer='uniform',
+            trainable=self._trainable,
+            name="bias_out")
+        super(PositionWiseFeedForward, self).build(input_shape)
+
+    def call(self, inputs):
+        if K.dtype(inputs) != 'float32':
+            inputs = K.cast(inputs, 'float32')
+        inner_out = K.relu(K.dot(inputs, self.weights_inner) + self.bias_inner)
+        outputs = K.dot(inner_out, self.weights_out) + self.bias_out
+        return outputs
+
+    def compute_output_shape(self, input_shape):
+        # Same shape as the input except for the last (feature) dimension
+        return input_shape[:-1] + (self._model_dim,)
+
+
+class LayerNormalization(Layer):
+
+    def __init__(self, epsilon=1e-8, **kwargs):
+        self._epsilon = epsilon
+        super(LayerNormalization, self).__init__(**kwargs)
+
+    def build(self, input_shape):
+        self.beta = self.add_weight(
+            shape=(input_shape[-1],),
+            initializer='zero',
+            name='beta')
+        self.gamma = self.add_weight(
+            shape=(input_shape[-1],),
+            initializer='one',
+            name='gamma')
+        super(LayerNormalization, self).build(input_shape)
+
+    def call(self, inputs):
+        mean, variance = tf.nn.moments(inputs, [-1], keepdims=True)
+        normalized = (inputs - mean) / ((variance + self._epsilon) ** 0.5)
+        outputs = self.gamma * normalized + self.beta
+        return outputs
+
+    def compute_output_shape(self, input_shape):
+        return input_shape
+
+
+class Add(Layer):
+
+    def __init__(self, **kwargs):
+        super(Add, self).__init__(**kwargs)
+
+    def call(self, inputs):
+        input_a, input_b = inputs
+        return input_a + input_b
+
+    def compute_output_shape(self, input_shape):
+        return input_shape[0]
+
+
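+# NOTE (assumption): none of the custom layers above implement get_config(),
+# so models built from them cannot be reloaded via
+# tf.keras.models.load_model() without passing custom_objects. A minimal
+# sketch for one layer; the same pattern applies to the others:
+#
+#     def get_config(self):
+#         config = super(LayerNormalization, self).get_config()
+#         config.update({'epsilon': self._epsilon})
+#         return config
+
+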
+if __name__ == "__main__":
+    from tensorflow.keras.models import Model
+    from tensorflow.keras.layers import Input, Dense, Dropout, GlobalAveragePooling1D
+    from tensorflow.keras.optimizers import Adam
+    from tensorflow.keras.callbacks import EarlyStopping
+    from tensorflow.keras.datasets import imdb
+    from tensorflow.keras.preprocessing import sequence
+    from tensorflow.keras.utils import to_categorical
+
+    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
+
+    vocab_size = 5000
+    max_len = 256
+    model_dim = 512
+    batch_size = 128
+    epochs = 10
+
+    print("Data downloading and pre-processing ... ")
+    (x_train, y_train), (x_test, y_test) = imdb.load_data(
+        maxlen=max_len, num_words=vocab_size)
+    x_train = sequence.pad_sequences(x_train, maxlen=max_len)
+    x_test = sequence.pad_sequences(x_test, maxlen=max_len)
+    x_train_masks = tf.equal(x_train, 0)
+    x_test_masks = tf.equal(x_test, 0)
+    y_train = to_categorical(y_train)
+    y_test = to_categorical(y_test)
+
+    print('Model building ... ')
+    inputs = Input(shape=(max_len,), name="inputs")
+    masks = Input(shape=(max_len,), name='masks')
+    embeddings = Embedding(vocab_size, model_dim)(inputs)
+    encodings = PositionEncoding(model_dim)(embeddings)
+    encodings = Add()([embeddings, encodings])
+    x = MultiHeadAttention(8, 64)([encodings, encodings, encodings, masks])
+    x = GlobalAveragePooling1D()(x)
+    x = Dropout(0.2)(x)
+    x = Dense(10, activation='relu')(x)
+    outputs = Dense(2, activation='softmax')(x)
+
+    model = Model(inputs=[inputs, masks], outputs=outputs)
+    model.compile(optimizer=Adam(beta_1=0.9, beta_2=0.98, epsilon=1e-9),
+                  loss='categorical_crossentropy', metrics=['accuracy'])
+
+    print("Model Training ... ")
+    es = EarlyStopping(patience=5)
+    model.fit([x_train, x_train_masks], y_train,
+              batch_size=batch_size, epochs=epochs, validation_split=0.2, callbacks=[es])
+
+    test_metrics = model.evaluate(
+        [x_test, x_test_masks], y_test, batch_size=batch_size, verbose=0)
+    print("loss on Test: %.4f" % test_metrics[0])
+    print("accuracy on Test: %.4f" % test_metrics[1])
diff --git a/transformer.py b/transformer.py
new file mode 100644
index 0000000..5deea38
--- /dev/null
+++ b/transformer.py
@@ -0,0 +1,193 @@
+'''
+@Description:
+@version:
+@License: MIT
+@Author: Wang Yao
+@Date: 2020-03-23 19:42:15
+@LastEditors: Wang Yao
+@LastEditTime: 2020-03-27 17:50:33
+'''
+import os
+import numpy as np
+import tensorflow as tf
+import tensorflow.keras.backend as K
+from tensorflow.keras.callbacks import Callback
+from layers import PositionEncoding
+from layers import MultiHeadAttention, PositionWiseFeedForward
+from layers import Add, LayerNormalization
+
+tf.config.experimental_run_functions_eagerly(True)
+
+
+class Transformer(tf.keras.layers.Layer):
+
+    def __init__(self, vocab_size, model_dim,
+                 n_heads=8, encoder_stack=6, decoder_stack=6, feed_forward_size=2048, dropout_rate=0.1, **kwargs):
+        self._vocab_size = vocab_size
+        self._model_dim = model_dim
+        self._n_heads = n_heads
+        self._encoder_stack = encoder_stack
+        self._decoder_stack = decoder_stack
+        self._feed_forward_size = feed_forward_size
+        self._dropout_rate = dropout_rate
+        super(Transformer, self).__init__(**kwargs)
+
+    def build(self, input_shape):
+        self.embeddings = self.add_weight(
+            shape=(self._vocab_size, self._model_dim),
+            initializer='glorot_uniform',
+            trainable=True,
+            name="embeddings")
+        super(Transformer, self).build(input_shape)
+
+    def encoder(self, inputs):
+        if K.dtype(inputs) != 'int32':
+            inputs = K.cast(inputs, 'int32')
+
+        masks = K.equal(inputs, 0)
+        # Embeddings
+        embeddings = K.gather(self.embeddings, inputs)
+        embeddings *= self._model_dim ** 0.5  # Scale
+        # Position Encodings
+        position_encodings = PositionEncoding(self._model_dim)(embeddings)
+        # Embeddings + position-encodings
+        encodings = embeddings + position_encodings
+        # Dropout
+        encodings = K.dropout(encodings, self._dropout_rate)
+
+        for i in range(self._encoder_stack):
+            # Multi-head-Attention
+            attention = MultiHeadAttention(
+                self._n_heads, self._model_dim // self._n_heads)
+            attention_input = [encodings, encodings, encodings, masks]
+            attention_out = attention(attention_input)
+            # Add & Norm
+            attention_out += encodings
+            attention_out = LayerNormalization()(attention_out)
+            # Feed-Forward
+            ff = PositionWiseFeedForward(
+                self._model_dim, self._feed_forward_size)
+            ff_out = ff(attention_out)
+            # Add & Norm
+            ff_out += attention_out
+            encodings = LayerNormalization()(ff_out)
+
+        return encodings, masks
+
+    def decoder(self, inputs):
+        decoder_inputs, encoder_encodings, encoder_masks = inputs
+        if K.dtype(decoder_inputs) != 'int32':
+            decoder_inputs = K.cast(decoder_inputs, 'int32')
+
+        decoder_masks = K.equal(decoder_inputs, 0)
+        # Embeddings
+        embeddings = K.gather(self.embeddings, decoder_inputs)
+        embeddings *= self._model_dim ** 0.5  # Scale
+        # Position Encodings
+        position_encodings = PositionEncoding(self._model_dim)(embeddings)
+        # Embeddings + position-encodings
+        encodings = embeddings + position_encodings
+        # Dropout
+        encodings = K.dropout(encodings, self._dropout_rate)
+
+        for i in range(self._decoder_stack):
+            # Masked-Multi-head-Attention
+            masked_attention = MultiHeadAttention(
+                self._n_heads, self._model_dim // self._n_heads, future=True)
+            masked_attention_input = [encodings,
+                                      encodings, encodings, decoder_masks]
+            masked_attention_out = masked_attention(masked_attention_input)
+            # Add & Norm
+            masked_attention_out += encodings
+            masked_attention_out = LayerNormalization()(masked_attention_out)
+
+            # Multi-head-Attention
+            attention = MultiHeadAttention(
+                self._n_heads, self._model_dim // self._n_heads)
+            attention_input = [masked_attention_out,
+                               encoder_encodings, encoder_encodings, encoder_masks]
+            attention_out = attention(attention_input)
+            # Add & Norm
+            attention_out += masked_attention_out
+            attention_out = LayerNormalization()(attention_out)
+
+            # Feed-Forward
+            ff = PositionWiseFeedForward(
+                self._model_dim, self._feed_forward_size)
+            ff_out = ff(attention_out)
+            # Add & Norm
+            ff_out += attention_out
+            encodings = LayerNormalization()(ff_out)
+
+        # The pre-softmax projection shares its weights with the embeddings
+        linear_projection = K.dot(encodings, K.transpose(self.embeddings))
+        outputs = K.softmax(linear_projection)
+        return outputs
+
+    def call(self, inputs):
+        encoder_inputs, decoder_inputs = inputs
+        encoder_encodings, encoder_masks = self.encoder(encoder_inputs)
+        encoder_outputs = self.decoder(
+            [decoder_inputs, encoder_encodings, encoder_masks])
+        return encoder_outputs
+
+    def compute_output_shape(self, input_shape):
+        return (input_shape[0][0], input_shape[0][1], self._vocab_size)
+
+
+class Noam(Callback):
+
+    def __init__(self, model_dim, step_num=0, warmup_steps=4000, verbose=False, **kwargs):
+        self._model_dim = model_dim
+        self._step_num = step_num
+        self._warmup_steps = warmup_steps
+        self.verbose = verbose
+        super(Noam, self).__init__(**kwargs)
+
+    def on_train_begin(self, logs=None):
+        logs = logs or {}
+        init_lr = self._model_dim ** -.5 * self._warmup_steps ** -1.5
+        K.set_value(self.model.optimizer.lr, init_lr)
+
+    def on_batch_end(self, batch, logs=None):
+        logs = logs or {}
+        self._step_num += 1
+        lrate = self._model_dim ** -.5 * \
+            K.minimum(self._step_num ** -.5, self._step_num *
+                      self._warmup_steps ** -1.5)
+        K.set_value(self.model.optimizer.lr, lrate)
+
+    def on_epoch_begin(self, epoch, logs=None):
+        if self.verbose:
+            lrate = K.get_value(self.model.optimizer.lr)
+            print(f"epoch {epoch} lr: {lrate}")
+
+    def on_epoch_end(self, epoch, logs=None):
+        logs = logs or {}
+        logs['lr'] = K.get_value(self.model.optimizer.lr)
+
+
+def label_smoothing(inputs, epsilon=0.1):
+    output_dim = inputs.shape[-1]
+    smooth_label = (1 - epsilon) * inputs + (epsilon / output_dim)
+    return smooth_label
+
+
+if __name__ == "__main__":
+    from tensorflow.keras.models import Model
+    from tensorflow.keras.layers import Input
+    from tensorflow.keras.utils import plot_model
+
+    vocab_size = 5000
+    max_seq_len = 256
+    model_dim = 512
+
+    encoder_inputs = Input(shape=(max_seq_len,), name='encoder_inputs')
+    decoder_inputs = Input(shape=(max_seq_len,), name='decoder_inputs')
+    outputs = Transformer(vocab_size, model_dim)(
+        [encoder_inputs, decoder_inputs])
+    model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=outputs)
+
+    model.summary()
+    plot_model(model, 'transformer.png')
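+
+    # label_smoothing() above is defined but never used; a usage sketch
+    # (assumption: illustrative only) -- smooth one-hot targets before
+    # computing a cross-entropy loss:
+    #
+    #   y_onehot = tf.one_hot([1, 3, 2], depth=vocab_size)
+    #   y_smooth = label_smoothing(y_onehot, epsilon=0.1)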