diff --git a/LSTM.py b/LSTM.py
index 5447f6f..5f17827 100644
--- a/LSTM.py
+++ b/LSTM.py
@@ -1,4 +1,95 @@
-# The best network structure could later be imported from an external module
+# %%
+import os
+from tqdm import tqdm
+import numpy as np
+import pandas as pd
+import argparse
+import tensorflow as tf
+from tensorflow import keras
+from tensorflow.keras import layers
+from tensorflow.keras import losses
+from tensorflow.keras import optimizers
+from tensorflow.keras.callbacks import ModelCheckpoint, LearningRateScheduler
+from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Dropout, Concatenate, Bidirectional, GlobalMaxPooling1D
+from tensorflow.keras.models import Model, Sequential
+from tensorflow.keras.preprocessing.sequence import pad_sequences
+from tensorflow.keras.preprocessing.text import Tokenizer
+from tensorflow.keras.utils import to_categorical
+from gensim.models import Word2Vec, KeyedVectors
+from layers import Add, LayerNormalization
+from layers import MultiHeadAttention, PositionWiseFeedForward
+from layers import PositionEncoding
+from tensorflow.keras.callbacks import Callback
+import tensorflow.keras.backend as K
+
+# %%
+
+
+def get_data():
+    DATA = {}
+    DATA['X1_train'] = np.load('tmp/inputs_0.npy', allow_pickle=True)
+    DATA['X1_val'] = np.load('tmp/inputs_1.npy', allow_pickle=True)
+    DATA['X2_train'] = np.load('tmp/inputs_2.npy', allow_pickle=True)
+    DATA['X2_val'] = np.load('tmp/inputs_3.npy', allow_pickle=True)
+    DATA['X3_train'] = np.load('tmp/inputs_4.npy', allow_pickle=True)
+    DATA['X3_val'] = np.load('tmp/inputs_5.npy', allow_pickle=True)
+    DATA['X4_train'] = np.load('tmp/inputs_6.npy', allow_pickle=True)
+    DATA['X4_val'] = np.load('tmp/inputs_7.npy', allow_pickle=True)
+    DATA['X5_train'] = np.load('tmp/inputs_8.npy', allow_pickle=True)
+    DATA['X5_val'] = np.load('tmp/inputs_9.npy', allow_pickle=True)
+    DATA['X6_train'] = np.load('tmp/inputs_10.npy', allow_pickle=True)
+    DATA['X6_val'] = np.load('tmp/inputs_11.npy', allow_pickle=True)
+    DATA['Y_gender_train'] = np.load('tmp/gender_0.npy', allow_pickle=True)
+    DATA['Y_gender_val'] = np.load('tmp/gender_1.npy', allow_pickle=True)
+    DATA['Y_age_train'] = np.load('tmp/age_0.npy', allow_pickle=True)
+    DATA['Y_age_val'] = np.load('tmp/age_1.npy', allow_pickle=True)
+    DATA['creative_id_emb'] = np.load(
+        'tmp/embeddings_0.npy', allow_pickle=True)
+    DATA['ad_id_emb'] = np.load(
+        'tmp/embeddings_1.npy', allow_pickle=True)
+    DATA['product_id_emb'] = np.load(
+        'tmp/embeddings_2.npy', allow_pickle=True)
+    DATA['advertiser_id_emb'] = np.load(
+        'tmp/embeddings_3.npy', allow_pickle=True)
+    DATA['industry_emb'] = np.load(
+        'tmp/embeddings_4.npy', allow_pickle=True)
+    DATA['product_category_emb'] = np.load(
+        'tmp/embeddings_5.npy', allow_pickle=True)
+
+    # DATA['Y_age_train'] = pd.read_csv(
+    #     'data/train_preliminary/user.csv').age.values-1
+    # DATA['Y_age_val'] = pd.read_csv(
+    #     'data/train_preliminary/user.csv').age.values-1
+    # DATA['Y_gender_train'] = pd.read_csv(
+    #     'data/train_preliminary/user.csv').gender.values-1
+    # DATA['Y_gender_val'] = pd.read_csv(
+    #     'data/train_preliminary/user.csv').gender.values-1
+
+    return DATA
+
+
+# %%
+DATA = get_data()
+
+cols_to_emb = ['creative_id', 'ad_id', 'advertiser_id',
+               'product_id', 'industry', 'product_category']
+
+emb_matrix_dict = {
+    'creative_id': [DATA['creative_id_emb'].astype('float32')],
+    'ad_id': [DATA['ad_id_emb'].astype('float32')],
+    'product_id': [DATA['product_id_emb'].astype('float32')],
+    'advertiser_id': [DATA['advertiser_id_emb'].astype('float32')],
+    'industry': [DATA['industry_emb'].astype('float32')],
+    'product_category': [DATA['product_category_emb'].astype('float32')],
+}
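+
+# A quick shape check (a sketch; it assumes each *_emb matrix is laid out as
+# (vocab_size + 1, emb_dim) with row 0 reserved for the padding index):
+for col in cols_to_emb:
+    print(col, emb_matrix_dict[col][0].shape)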
+
+conv1d_info_dict = {'creative_id': 128, 'ad_id': 128, 'advertiser_id': 128,
+                    'industry': 128, 'product_category': 128,
+                    'product_id': 128, 'time': 32, 'click_times': -1}
+# %%
+seq_length_creative_id = 100
+labeli = 'age'
+# %%


 class BiLSTM_Model:
@@ -12,7 +103,7 @@ def __init__(self, n_units):

     def get_emb_layer(self, emb_matrix, input_length, trainable):
         '''
-        Embedding layer: looks up vectors from the matrix by index
+        Embedding layer: looks up vectors from the matrix by index
         '''
         embedding_dim = emb_matrix.shape[-1]
         input_dim = emb_matrix.shape[0]
@@ -22,7 +113,7 @@ def get_emb_layer(self, emb_matrix, input_length, trainable):
                               trainable=trainable)
         return emb_layer

-    def get_input_layer(self, name=None, dtype="int32"):
+    def get_input_layer(self, name=None, dtype="int64"):
         '''
        Input layer: a sequence of dictionary indices
         '''
@@ -114,27 +205,69 @@ def create_model(self, num_class, labeli):
         print(model.summary())
         optimizer = keras.optimizers.Adam(1e-3)
         model.compile(optimizer=optimizer,
-                      loss='sparse_categorical_crossentropy',
+                      # loss='sparse_categorical_crossentropy',
+                      loss=tf.keras.losses.CategoricalCrossentropy(
+                          from_logits=False),
                       metrics=['accuracy'])
         return model


-earlystop_callback = tf.keras.callbacks.EarlyStopping(
-    monitor="val_accuracy",
-    min_delta=0.00001,
-    patience=3,
-    verbose=1,
-    mode="max",
-    baseline=None,
-    restore_best_weights=True,
+# %%
+model = BiLSTM_Model(n_units=128).create_model(10, 'age')
+
+# %%
+# train_examples = 720000
+# val_examples = 180000
+train_examples = 810000
+val_examples = 90000
+model.fit(
+    {
+        'creative_id': DATA['X1_train'][:train_examples],
+        'ad_id': DATA['X2_train'][:train_examples],
+        'product_id': DATA['X3_train'][:train_examples],
+        'advertiser_id': DATA['X4_train'][:train_examples],
+        'industry': DATA['X5_train'][:train_examples],
+        'product_category': DATA['X6_train'][:train_examples]
+    },
+    {
+        # 'gender': DATA['Y_gender_train'][:train_examples],
+        'age': DATA['Y_age_train'][:train_examples],
+    },
+    validation_data=(
+        {
+            'creative_id': DATA['X1_val'][:val_examples],
+            'ad_id': DATA['X2_val'][:val_examples],
+            'product_id': DATA['X3_val'][:val_examples],
+            'advertiser_id': DATA['X4_val'][:val_examples],
+            'industry': DATA['X5_val'][:val_examples],
+            'product_category': DATA['X6_val'][:val_examples]
+        },
+        {
+            # 'gender': DATA['Y_gender_val'][:val_examples],
+            'age': DATA['Y_age_val'][:val_examples],
+        },
+    ),
+    epochs=10,
+    batch_size=1024,
+    # callbacks=[checkpoint, earlystop_callback, reduce_lr_callback],
 )
+# %%
+# earlystop_callback = tf.keras.callbacks.EarlyStopping(
+#     monitor="val_accuracy",
+#     min_delta=0.00001,
+#     patience=3,
+#     verbose=1,
+#     mode="max",
+#     baseline=None,
+#     restore_best_weights=True,
+# )

-csv_log_callback = tf.keras.callbacks.CSVLogger(
-    filename='logs_save/{}_nn_v0621_{}d_bilstm.log'.format(labeli, count), separator=",", append=True)
+# csv_log_callback = tf.keras.callbacks.CSVLogger(
+#     filename='logs_save/{}_nn_v0621_{}d_bilstm.log'.format(labeli, count), separator=",", append=True)

-reduce_lr_callback = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_accuracy',
-                                                          factor=0.5,
-                                                          patience=1,
-                                                          min_lr=0.0000001)
+# reduce_lr_callback = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_accuracy',
+#                                                           factor=0.5,
+#                                                           patience=1,
+#                                                           min_lr=0.0000001)

-callbacks = [earlystop_callback, csv_log_callback, reduce_lr_callback]
+# callbacks = [earlystop_callback, csv_log_callback, reduce_lr_callback]
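+
+# Note on the loss change above: CategoricalCrossentropy(from_logits=False)
+# expects one-hot targets of shape (n, 10). If tmp/age_0.npy stores integer
+# labels of shape (n,), convert them before fit(), e.g.
+#     DATA['Y_age_train'] = to_categorical(DATA['Y_age_train'], num_classes=10)
+#     DATA['Y_age_val'] = to_categorical(DATA['Y_age_val'], num_classes=10)
+# otherwise keep loss='sparse_categorical_crossentropy'.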
diff --git a/Transformer_keras_6_input.py b/Transformer_keras_6_input.py
index e68575d..d02bda2 100644
--- a/Transformer_keras_6_input.py
+++ b/Transformer_keras_6_input.py
@@ -82,15 +82,12 @@
 NUM_industry = 332
 NUM_product_category = 18

-vocab_size = 5000
-max_seq_len = 100
-
-LEN_creative_id = 100
-LEN_ad_id = 100
-LEN_product_id = 100
-LEN_advertiser_id = 100
-LEN_industry = 100
-LEN_product_category = 100
+LEN_creative_id = 150
+LEN_ad_id = 150
+LEN_product_id = 150
+LEN_advertiser_id = 150
+LEN_industry = 150
+LEN_product_category = 150


 # %%
@@ -207,64 +204,64 @@ def get_gender_model(DATA):

 def get_age_model(DATA):
     feed_forward_size = 2048
-    max_seq_len = 100
-    model_dim = 128*6
+    max_seq_len = 150
+    model_dim = 256+256+64+32+8+16

     input_creative_id = Input(shape=(max_seq_len,), name='creative_id')
     x1 = Embedding(input_dim=NUM_creative_id+1,
-                   output_dim=128,
+                   output_dim=256,
                    weights=[DATA['creative_id_emb']],
                    trainable=args.not_train_embedding,
                    # trainable=False,
-                   input_length=100,
+                   input_length=max_seq_len,
                    mask_zero=True)(input_creative_id)
     # encodings = PositionEncoding(model_dim)(x1)
     # encodings = Add()([embeddings, encodings])

     input_ad_id = Input(shape=(max_seq_len,), name='ad_id')
     x2 = Embedding(input_dim=NUM_ad_id+1,
-                   output_dim=128,
+                   output_dim=256,
                    weights=[DATA['ad_id_emb']],
                    trainable=args.not_train_embedding,
                    # trainable=False,
-                   input_length=100,
+                   input_length=max_seq_len,
                    mask_zero=True)(input_ad_id)

     input_product_id = Input(shape=(max_seq_len,), name='product_id')
     x3 = Embedding(input_dim=NUM_product_id+1,
-                   output_dim=128,
+                   output_dim=256,
                    weights=[DATA['product_id_emb']],
                    trainable=args.not_train_embedding,
                    # trainable=False,
-                   input_length=100,
+                   input_length=max_seq_len,
                    mask_zero=True)(input_product_id)

     input_advertiser_id = Input(shape=(max_seq_len,), name='advertiser_id')
     x4 = Embedding(input_dim=NUM_advertiser_id+1,
-                   output_dim=128,
+                   output_dim=256,
                    weights=[DATA['advertiser_id_emb']],
                    trainable=args.not_train_embedding,
                    # trainable=False,
-                   input_length=100,
+                   input_length=max_seq_len,
                    mask_zero=True)(input_advertiser_id)

     input_industry = Input(shape=(max_seq_len,), name='industry')
     x5 = Embedding(input_dim=NUM_industry+1,
-                   output_dim=128,
+                   output_dim=256,
                    weights=[DATA['industry_emb']],
                    trainable=args.not_train_embedding,
                    # trainable=False,
-                   input_length=100,
+                   input_length=max_seq_len,
                    mask_zero=True)(input_industry)

     input_product_category = Input(
         shape=(max_seq_len,), name='product_category')
     x6 = Embedding(input_dim=NUM_product_category+1,
-                   output_dim=128,
+                   output_dim=256,
                    weights=[DATA['product_category_emb']],
                    trainable=args.not_train_embedding,
                    # trainable=False,
-                   input_length=100,
+                   input_length=max_seq_len,
                    mask_zero=True)(input_product_category)

     # (bs, 100, 128*2)
@@ -273,7 +270,8 @@ def get_age_model(DATA):
     masks = tf.equal(input_creative_id, 0)

     # (bs, 100, 128*2)
-    attention_out = MultiHeadAttention(8, 96)(
+    # after concatenation the model width is 632
+    attention_out = MultiHeadAttention(8, 76)(
         [encodings, encodings, encodings, masks])

     # Add & Norm
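+    # Width check for the hunk above: the six embeddings concatenate to
+    # 256+256+64+32+8+16 = 632, but 8 heads * 76 = 608, while 8 * 79 = 632
+    # would match exactly. Whether MultiHeadAttention(n_heads, head_dim)
+    # must satisfy n_heads * head_dim == model_dim depends on the output
+    # projection in layers.py, so verify before trusting either number.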
diff --git a/Transformer_keras_8_input.py b/Transformer_keras_8_input.py
index e68575d..2d999c6 100644
--- a/Transformer_keras_8_input.py
+++ b/Transformer_keras_8_input.py
@@ -1,16 +1,4 @@
 # %%
-# Generate the word-embedding files
-from layers import Add, LayerNormalization
-from layers import MultiHeadAttention, PositionWiseFeedForward
-from layers import PositionEncoding
-from tensorflow.keras.callbacks import Callback
-import tensorflow.keras.backend as K
-from mymail import mail
-from gensim.models import Word2Vec, KeyedVectors
-from tensorflow.keras.utils import to_categorical
-from tensorflow.keras.preprocessing.text import Tokenizer
-from tensorflow.keras.preprocessing.sequence import pad_sequences
-from tensorflow.keras.models import Model, Sequential
 import os
 from tqdm import tqdm
 import numpy as np
@@ -28,6 +16,12 @@
 from tensorflow.keras.preprocessing.text import Tokenizer
 from tensorflow.keras.utils import to_categorical
 from gensim.models import Word2Vec, KeyedVectors
+from layers import Add, LayerNormalization
+from layers import MultiHeadAttention, PositionWiseFeedForward
+from layers import PositionEncoding
+from tensorflow.keras.callbacks import Callback
+import tensorflow.keras.backend as K
+
 from mymail import mail


@@ -428,6 +422,26 @@ def get_train(feature_name, vocab_size, len_feature):
     DATA['X6_val'] = X6_train[train_examples:]
     DATA['product_category_emb'] = product_category_emb

+    # Seventh input
+    print('loading the time feature')
+    X7_train, tokenizer = get_train(
+        'time', NUM_time+1, LEN_time)
+    time_emb = get_embedding('time', tokenizer)
+
+    DATA['X7_train'] = X7_train[:train_examples]
+    DATA['X7_val'] = X7_train[train_examples:]
+    DATA['time_emb'] = time_emb
+
+    # Eighth input
+    print('loading the click_times feature')
+    # use X8 here: reusing X6 would clobber the product_category inputs above
+    X8_train, tokenizer = get_train(
+        'click_times', NUM_click_times+1, LEN_click_times)
+    click_times_emb = get_embedding('click_times', tokenizer)
+
+    DATA['X8_train'] = X8_train[:train_examples]
+    DATA['X8_val'] = X8_train[train_examples:]
+    DATA['click_times_emb'] = click_times_emb
+
     return DATA
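+
+# NUM_time, LEN_time, NUM_click_times and LEN_click_times are not shown in
+# this diff; they are assumed to be defined with the other NUM_* / LEN_*
+# constants near the top of the file, e.g. (hypothetical values):
+# NUM_time = 91
+# LEN_time = 150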
diff --git a/tmp.py b/tmp.py
index 4aa3cb4..ad2f0e7 100644
--- a/tmp.py
+++ b/tmp.py
@@ -1,64 +1,55 @@
-from numpy import asarray
-from numpy import zeros
-from keras.preprocessing.text import Tokenizer
-from keras.preprocessing.sequence import pad_sequences
-from keras.models import Sequential
-from keras.layers import Dense
-from keras.layers import Flatten
-from keras.layers import Embedding
-# define documents
-docs = ['Well done!',
-        'Good work',
-        'Great effort',
-        'nice work',
-        'Excellent!',
-        'Weak',
-        'Poor effort!',
-        'not good',
-        'poor work',
-        'Could have done better.']
-# define class labels
-labels = [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
-# prepare tokenizer
-t = Tokenizer()
-t.fit_on_texts(docs)
-vocab_size = len(t.word_index) + 1
-# integer encode the documents
-encoded_docs = t.texts_to_sequences(docs)
-print(encoded_docs)
-# pad documents to a max length of 4 words
-max_length = 4
-padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
-print(padded_docs)
-# load the whole embedding into memory
-embeddings_index = dict()
-f = open('../glove_data/glove.6B/glove.6B.100d.txt')
-for line in f:
-    values = line.split()
-    word = values[0]
-    coefs = asarray(values[1:], dtype='float32')
-    embeddings_index[word] = coefs
-f.close()
-print('Loaded %s word vectors.' % len(embeddings_index))
-# create a weight matrix for words in training docs
-embedding_matrix = zeros((vocab_size, 100))
-for word, i in t.word_index.items():
-    embedding_vector = embeddings_index.get(word)
-    if embedding_vector is not None:
-        embedding_matrix[i] = embedding_vector
-# define model
-model = Sequential()
-e = Embedding(vocab_size, 100, weights=[
-              embedding_matrix], input_length=4, trainable=False)
-model.add(e)
-model.add(Flatten())
-model.add(Dense(1, activation='sigmoid'))
-# compile the model
-model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
-# summarize the model
-print(model.summary())
-# fit the model
-model.fit(padded_docs, labels, epochs=50, verbose=0)
-# evaluate the model
-loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
-print('Accuracy: %f' % (accuracy*100))
+import pandas as pd
+import numpy as np
+
+DATA = {}
+DATA['ad_id_emb'] = np.load(
+    'tmp/embeddings_0.npy', allow_pickle=True)
+arr = DATA['ad_id_emb']
+
+result = []
+for i in range(arr.shape[-1]):
+    result.append([np.mean(arr[:, i]), np.std(arr[:, i])])
+dfi = pd.DataFrame(result, columns=['mean', 'std'])
+print(dfi.describe().T)
+# from gensim.models import Word2Vec
+# from gensim.models.callbacks import CallbackAny2Vec
+
+
+# class LossCallback(CallbackAny2Vec):
+#     '''Callback to print loss after each epoch.'''
+
+#     def __init__(self):
+#         self.epoch = 0
+#         self.loss_to_be_subed = 0
+
+#     def on_epoch_end(self, model):
+#         loss = model.get_latest_training_loss()
+#         loss_now = loss - self.loss_to_be_subed
+#         self.loss_to_be_subed = loss
+#         print('Loss after epoch {}: {}'.format(self.epoch, loss_now))
+#         self.epoch += 1
+
+
+# model = Word2Vec(common_texts, size=100, window=5, min_count=1,
+#                  compute_loss=True, callbacks=[LossCallback()])
+
+
+# tmp = df.groupby(sentence_id,
+#                  as_index=False)[word_id].agg({list_col_nm: list})
+# sentences = tmp[list_col_nm].values.tolist()
+# all_words_vocabulary = df[word_id].unique().tolist()
+# del tmp[list_col_nm]
+# gc.collect()
+
+# if embedding_type == 'w2v':
+#     model = Word2Vec(
+#         sentences,
+#         size=emb_size,
+#         window=150,
+#         workers=n_jobs,
+#         min_count=1,  # minimum word frequency; min_count > 1 leaves OOV tokens
+#         sg=sg,  # 1 for skip-gram; otherwise CBOW.
+#         hs=hs,  # If 1, hierarchical softmax will be used for model training
+#         negative=negative,  # negative sampling (negative > 0 enables it)
+#         iter=epoch,
+#         seed=0)
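+
+# The mean/std table above is a quick degeneracy check on the saved
+# embedding: a dimension whose std is near 0 carries no information. An
+# equivalent pandas one-liner, for reference (note pandas' std uses ddof=1
+# while np.std uses ddof=0):
+# print(pd.DataFrame(arr).describe().T[['mean', 'std']].describe().T)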
diff --git a/word2vec_creative_id_new.py b/word2vec_creative_id_new.py
new file mode 100644
index 0000000..6a91b90
--- /dev/null
+++ b/word2vec_creative_id_new.py
@@ -0,0 +1,128 @@
+# Generate a word embedding for each creative_id from the sequence of
+# creative_ids each user visited
+# %%
+import pandas as pd
+import numpy as np
+from tqdm import tqdm
+from gensim.test.utils import datapath
+from gensim.models.word2vec import LineSentence
+from gensim.models import Word2Vec
+from gensim.models import KeyedVectors
+from gensim.test.utils import common_texts, get_tmpfile
+import pickle
+from mymail import mail
+# %%
+df = pd.read_csv(
+    'data/click_log_ad.csv')
+# df_test = pd.read_csv('data/test/clicklog_ad_user_test.csv')
+columns = ['user_id', 'creative_id']
+# frame = [df_train[columns], df_test[columns]]
+# df_train_test = pd.concat(frame, ignore_index=True)
+# df_train_test_sorted = df_train_test.sort_values(
+#     ["user_id", "time"], ascending=(True, True))
+# %%
+# NOTE: df_train_test_sorted is produced by the commented concat/sort block
+# above; run that first, or skip the dump and only load the pickle.
+with open('word2vec/df_train_test_sorted.pkl', 'wb') as f:
+    pickle.dump(df_train_test_sorted, f)
+with open('word2vec/df_train_test_sorted.pkl', 'rb') as f:
+    df_train_test_sorted = pickle.load(f)
+# %%
+userid_creative_id = df.groupby(
+    'user_id')['creative_id'].apply(list).reset_index(name='creative_id')
+# %%
+with open('word2vec_new/creative_id.txt', 'w') as f:
+    # column name is set by reset_index(name='creative_id') above
+    for ids in userid_creative_id.creative_id:
+        ids = [str(e) for e in ids]
+        line = ' '.join(ids)
+        f.write(line+'\n')
+# %%
+sentences = LineSentence('word2vec_new/creative_id.txt')
+dimension_embedding = 128
+
+# workers must be a positive thread count: gensim silently trains nothing
+# when given workers=-1
+model = Word2Vec(sentences, size=dimension_embedding,
+                 window=10, min_count=1, workers=8, iter=10, sg=1)
+model.save("word2vec/word2vec_creative_id.model")
+path = "word2vec/wordvectors_creative_id.kv"
+model.wv.save(path)
+print('Save embedding done!!!')
+# %%
+path = "word2vec/wordvectors_creative_id.kv"
+wv = KeyedVectors.load(path, mmap='r')
+dimension_embedding = 128
+columns = ['c'+str(i) for i in range(dimension_embedding)]
+data = {}
+for col_name in columns:
+    data[col_name] = pd.Series([], dtype='float')
+df_creative_id_embedding = pd.DataFrame(data)
+
+# %%
+data = {}
+for key in tqdm(wv.vocab):
+    data[int(key)] = wv[key].tolist()
+# %%
+df_creative_id_embedding = pd.DataFrame.from_dict(
+    data, orient='index',
+    columns=columns)
+df_creative_id_embedding['creative_id'] = df_creative_id_embedding.index
+# %%
+df_creative_id_embedding.to_hdf(
+    'word2vec/df_creative_id_embedding.h5',
+    key='df_creative_id_embedding', mode='w')
+mail('save h5 done')
+# %%
+df_creative_id_embedding = pd.read_hdf(
+    'word2vec/df_creative_id_embedding.h5',
+    key='df_creative_id_embedding', mode='r')
+# %%
+try:
+    userid_creative_id_embedding = pd.merge(
+        df_train_test_sorted, df_creative_id_embedding, on='creative_id', how='left')
+    userid_creative_id_embedding.drop(
+        columns=['creative_id', 'time'], inplace=True)
+    userid_creative_id_embedding.groupby('user_id').mean().to_csv(
+        'word2vec/creative_id.csv', header=True, index=False)
+    mail('to csv done')
+except Exception:
+    mail('failed')
+# %%
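+# The commented block below is the superseded per-user loop ("this will
+# take 24 hours"); the merge + groupby('user_id').mean() above computes the
+# same per-user average of creative_id embeddings in one vectorized pass.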
+# columns = ['c'+str(i) for i in range(128)]
+# data = {}
+# for col_name in columns:
+#     data[col_name] = pd.Series([], dtype='float')
+# df_user_embedding = pd.DataFrame(data)
+# # %%
+# # this will take 24 hours!!!
+# # debug = 0
+# for user in tqdm(range(len(seq_creative_id))):
+#     user_em = df_creative_id_embedding.loc[seq_creative_id[user]].mean()
+#     # df_user_embedding = df_user_embedding.append(user_em, ignore_index=True)
+    # debug += 1
+    # if debug == 10:
+    #     break
+# debug = 0
+# frames = []
+# for creative_id in tqdm.tqdm(wv.vocab):
+#     creativeid_embedding = wv[creative_id]
+#     tmp = pd.DataFrame(
+#         creativeid_embedding.reshape(-1, len(creativeid_embedding)),
+#         columns=columns[:-1])
+#     # df_creativeid_embedding = df_creativeid_embedding.append(tmp)
+#     frames.append(tmp)
+#     if len(frames) == 1000000:
+#         # frames = [df_creativeid_embedding, tmp]
+#         frames = [df_creativeid_embedding]+frames
+#         df_creativeid_embedding = pd.concat(frames)
+#         frames = []
+#     df_creativeid_embedding.iloc[-1, -1] = str(creative_id)
+# %%
+# if len(frames) != 0:
+#     frames = [df_creativeid_embedding]+frames
+#     df_creativeid_embedding = pd.concat(frames)
+# df_creativeid_embedding.to_hdf('data/clicklog_ad_user_train_eval_test.h5',
+#                                key='df_creativeid_embedding', mode='w')
+
+# debug += 1
+# if debug == 10:
+#     break
+
+
+# %%
diff --git a/word2vec/.gitignore b/word2vec_old/.gitignore
similarity index 100%
rename from word2vec/.gitignore
rename to word2vec_old/.gitignore
diff --git a/word2vec_old_old/.gitignore b/word2vec_old_old/.gitignore
new file mode 100644
index 0000000..1c471ba
--- /dev/null
+++ b/word2vec_old_old/.gitignore
@@ -0,0 +1,8 @@
+# .gitignore sample
+###################
+
+# Ignore all files in this dir...
+*
+
+# ... except for this one.
+!.gitignore
\ No newline at end of file