Skip to content

Commit

Permalink
update predict transformer
Browse files Browse the repository at this point in the history
  • Loading branch information
sunlanchang committed Jun 21, 2020
1 parent 4347787 commit e55be5c
Show file tree
Hide file tree
Showing 2 changed files with 690 additions and 46 deletions.
146 changes: 100 additions & 46 deletions Transformer_keras_6_input.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,19 +46,24 @@
default=False)
parser.add_argument('--not_train_embedding', action='store_false',
help='从npy文件加载数据',

default=True)
parser.add_argument('--gender', action='store_true',
help='gender model',
default=False)
parser.add_argument('--age', action='store_true',
help='age model',
default=False)

parser.add_argument('--batch_size', type=int,
help='batch size大小',
default=256)
parser.add_argument('--epoch', type=int,
help='epoch 大小',
default=5)
parser.add_argument('--predict', action='store_true',
help='从npy文件加载数据',
default=False)

parser.add_argument('--num_transformer', type=int,
help='transformer层数',
Expand Down Expand Up @@ -229,7 +234,7 @@ def get_age_model(DATA):

input_product_id = Input(shape=(max_seq_len,), name='product_id')
x3 = Embedding(input_dim=NUM_product_id+1,
output_dim=256,
output_dim=32,
weights=[DATA['product_id_emb']],
trainable=args.not_train_embedding,
# trainable=False,
Expand All @@ -238,7 +243,7 @@ def get_age_model(DATA):

input_advertiser_id = Input(shape=(max_seq_len,), name='advertiser_id')
x4 = Embedding(input_dim=NUM_advertiser_id+1,
output_dim=256,
output_dim=64,
weights=[DATA['advertiser_id_emb']],
trainable=args.not_train_embedding,
# trainable=False,
Expand All @@ -247,7 +252,7 @@ def get_age_model(DATA):

input_industry = Input(shape=(max_seq_len,), name='industry')
x5 = Embedding(input_dim=NUM_industry+1,
output_dim=256,
output_dim=16,
weights=[DATA['industry_emb']],
trainable=args.not_train_embedding,
# trainable=False,
Expand All @@ -257,7 +262,7 @@ def get_age_model(DATA):
input_product_category = Input(
shape=(max_seq_len,), name='product_category')
x6 = Embedding(input_dim=NUM_product_category+1,
output_dim=256,
output_dim=8,
weights=[DATA['product_category_emb']],
trainable=args.not_train_embedding,
# trainable=False,
Expand Down Expand Up @@ -313,23 +318,6 @@ def get_age_model(DATA):

def get_train_val():

# 提取词向量文件
def get_embedding(feature_name, tokenizer):
path = f'word2vec_new/{feature_name}.kv'
wv = KeyedVectors.load(path, mmap='r')
feature_tokens = list(wv.vocab.keys())
feature_name_dict = {'creative_id': 256, 'ad_id': 256, 'advertiser_id': 64,
'product_id': 32, 'product_category': 8, 'industry': 16}
embedding_dim = feature_name_dict[feature_name]
embedding_matrix = np.random.randn(
len(feature_tokens)+1, embedding_dim)
for feature in feature_tokens:
embedding_vector = wv[feature]
if embedding_vector is not None:
index = tokenizer.texts_to_sequences([feature])[0][0]
embedding_matrix[index] = embedding_vector
return embedding_matrix

# 从序列文件提取array格式数据
def get_train(feature_name, vocab_size, len_feature):
f = open(f'word2vec_new/{feature_name}.txt')
Expand All @@ -342,15 +330,38 @@ def get_train(feature_name, vocab_size, len_feature):
for text in f:
feature_seq.append(text.strip())

sequences = tokenizer.texts_to_sequences(feature_seq[:900000//1])
sequences = tokenizer.texts_to_sequences(feature_seq[:900000])
X_train = pad_sequences(
sequences, maxlen=len_feature, padding='post')
return X_train, tokenizer

# 构造输出的训练标签
# 获得age、gender标签
sequences = tokenizer.texts_to_sequences(feature_seq[900000:])
X_test = pad_sequences(
sequences, maxlen=len_feature, padding='post')
return X_train, tokenizer, X_test

# Build the pretrained embedding matrix for one feature from its word2vec file.
def get_embedding(feature_name, tokenizer):
    """Return an embedding matrix for `feature_name` aligned to `tokenizer`.

    Row i of the matrix holds the word2vec vector of the token whose
    tokenizer index is i; tokens absent from the word2vec vocabulary keep
    their random initialization. Row 0 stays random (Keras reserves index 0
    for padding).
    """
    path = f'word2vec_new/{feature_name}.kv'
    wv = KeyedVectors.load(path, mmap='r')
    feature_tokens = list(wv.vocab.keys())
    # Per-feature embedding dimensionality; must match the model's
    # Embedding layers (see get_age_model output_dim values).
    feature_name_dict = {'creative_id': 256, 'ad_id': 256, 'advertiser_id': 64,
                         'product_id': 32, 'product_category': 8, 'industry': 16}
    embedding_dim = feature_name_dict[feature_name]
    embedding_matrix = np.random.randn(
        len(feature_tokens) + 1, embedding_dim)
    for word, i in tokenizer.word_index.items():
        # Guard: tokenizer.word_index covers the full fitted vocabulary and
        # may contain indices beyond the word2vec vocab size; skip those
        # instead of raising IndexError.
        if i >= embedding_matrix.shape[0]:
            continue
        # BUG FIX: the original did `embedding_vector = wv[word]` and then
        # tested `is not None` — but KeyedVectors raises KeyError for
        # unknown words, so the None branch was unreachable. Test
        # membership explicitly instead.
        if word in wv.vocab:
            embedding_matrix[i] = wv[word]
        else:
            print(str(word)+' 没有找到')
    return embedding_matrix

DATA = {}
# 获取test数据

# 构造输出的训练标签
# 获得age、gender标签
user_train = pd.read_csv(
'data/train_preliminary/user.csv').sort_values(['user_id'], ascending=(True,))
Y_gender = user_train['gender'].values
Expand All @@ -370,62 +381,69 @@ def get_train(feature_name, vocab_size, len_feature):

# 第一个输入
print('获取 creative_id 特征')
X1_train, tokenizer = get_train(
X1_train, tokenizer, X1_test = get_train(
'creative_id', NUM_creative_id+1, LEN_creative_id) # +1为了UNK的creative_id
creative_id_emb = get_embedding('creative_id', tokenizer)

DATA['X1_train'] = X1_train[:train_examples]
DATA['X1_val'] = X1_train[train_examples:]
DATA['X1_test'] = X1_test
DATA['creative_id_emb'] = creative_id_emb

# 第二个输入
print('获取 ad_id 特征')
X2_train, tokenizer = get_train(
X2_train, tokenizer, X2_test = get_train(
'ad_id', NUM_ad_id+1, LEN_ad_id)
ad_id_emb = get_embedding('ad_id', tokenizer)

DATA['X2_train'] = X2_train[:train_examples]
DATA['X2_val'] = X2_train[train_examples:]
DATA['X2_test'] = X2_test

DATA['ad_id_emb'] = ad_id_emb

# 第三个输入
print('获取 product_id 特征')
X3_train, tokenizer = get_train(
X3_train, tokenizer, X3_test = get_train(
'product_id', NUM_product_id+1, LEN_product_id)
product_id_emb = get_embedding('product_id', tokenizer)

DATA['X3_train'] = X3_train[:train_examples]
DATA['X3_val'] = X3_train[train_examples:]
DATA['X3_test'] = X3_test
DATA['product_id_emb'] = product_id_emb

# 第四个输入
print('获取 advertiser_id 特征')
X4_train, tokenizer = get_train(
X4_train, tokenizer, X4_test = get_train(
'advertiser_id', NUM_advertiser_id+1, LEN_advertiser_id)
advertiser_id_emb = get_embedding('advertiser_id', tokenizer)

DATA['X4_train'] = X4_train[:train_examples]
DATA['X4_val'] = X4_train[train_examples:]
DATA['X4_test'] = X4_test
DATA['advertiser_id_emb'] = advertiser_id_emb

# 第五个输入
print('获取 industry 特征')
X5_train, tokenizer = get_train(
X5_train, tokenizer, X5_test = get_train(
'industry', NUM_industry+1, LEN_industry)
industry_emb = get_embedding('industry', tokenizer)

DATA['X5_train'] = X5_train[:train_examples]
DATA['X5_val'] = X5_train[train_examples:]
DATA['X5_test'] = X5_test
DATA['industry_emb'] = industry_emb

# 第六个输入
print('获取 product_category 特征')
X6_train, tokenizer = get_train(
X6_train, tokenizer, X6_test = get_train(
'product_category', NUM_product_category+1, LEN_product_category)
product_category_emb = get_embedding('product_category', tokenizer)

DATA['X6_train'] = X6_train[:train_examples]
DATA['X6_val'] = X6_train[train_examples:]
DATA['X6_test'] = X6_test
DATA['product_category_emb'] = product_category_emb

return DATA
Expand All @@ -448,6 +466,12 @@ def save_npy(datas, name):
np.save(f'tmp/{name}_{i}.npy', data)
print(f'saving tmp/{name}_{i}.npy')

test = [DATA['X1_test'],
DATA['X2_test'],
DATA['X3_test'],
DATA['X4_test'],
DATA['X5_test'],
DATA['X6_test'], ]
inputs = [
DATA['X1_train'], DATA['X1_val'],
DATA['X2_train'], DATA['X2_val'],
Expand All @@ -466,6 +490,7 @@ def save_npy(datas, name):
DATA['industry_emb'],
DATA['product_category_emb'],
]
save_npy(test, 'test')
save_npy(inputs, 'inputs')
save_npy(outputs_gender, 'gender')
save_npy(outputs_age, 'age')
Expand Down Expand Up @@ -501,6 +526,13 @@ def save_npy(datas, name):
DATA['product_category_emb'] = np.load(
'tmp/embeddings_5.npy', allow_pickle=True)

# Test-set inputs previously saved by save_npy(test, 'test').
# BUG FIX: keys renamed from 'X_testN' to 'XN_test' so they match the keys
# produced by get_train_val() (DATA['X1_test'], ...) and the keys read by the
# prediction code (model.predict({... DATA['X1_test'] ...})); the old names
# caused a KeyError when predicting from npy-loaded data.
DATA['X1_test'] = np.load('tmp/test_0.npy', allow_pickle=True)
DATA['X2_test'] = np.load('tmp/test_1.npy', allow_pickle=True)
DATA['X3_test'] = np.load('tmp/test_2.npy', allow_pickle=True)
DATA['X4_test'] = np.load('tmp/test_3.npy', allow_pickle=True)
DATA['X5_test'] = np.load('tmp/test_4.npy', allow_pickle=True)
DATA['X6_test'] = np.load('tmp/test_5.npy', allow_pickle=True)


# %%

Expand Down Expand Up @@ -624,21 +656,43 @@ def save_npy(datas, name):
# mail('train failed!!! ' + e)
print(e)
# %%
# model.load_weights('tmp/gender_epoch_01.hdf5')


# # %%
# if debug:
# sequences = tokenizer.texts_to_sequences(
# creative_id_seq[900000:])
# else:
# sequences = tokenizer.texts_to_sequences(
# creative_id_seq[900000:])

# X_test = pad_sequences(sequences, maxlen=LEN_creative_id)
# # %%
# y_pred = model.predict(X_test, batch_size=4096)

# Inference: load a trained checkpoint, predict on the test set, and write
# the per-task CSV plus the combined submission file.
if args.predict:
    # NOTE(review): the checkpoint path is hard-coded to the *gender* model
    # even when --age is set — confirm the intended weights file per task.
    model.load_weights('tmp/gender_epoch_01.hdf5')
    # The dict keys must match the name= of each Input layer in the model.
    y_pred = model.predict(
        {
            'creative_id': DATA['X1_test'],
            'ad_id': DATA['X2_test'],
            'product_id': DATA['X3_test'],
            'advertiser_id': DATA['X4_test'],
            'industry': DATA['X5_test'],
            'product_category': DATA['X6_test']
        },
        batch_size=1024,
    )
    # Softmax probabilities -> class index; +1 shifts the 0-based argmax back
    # to the competition's 1-based labels (gender 1-2 / age 1-10).
    y_pred = np.argmax(y_pred, axis=1)
    y_pred = y_pred.flatten()
    y_pred += 1

    # Persist the predictions for whichever task this run handled.
    if args.gender:
        ans = pd.DataFrame({'predicted_gender': y_pred})
        ans.to_csv(
            'data/ans/transformer_gender.csv', header=True, columns=['predicted_gender'], index=False)
    elif args.age:
        ans = pd.DataFrame({'predicted_age': y_pred})
        ans.to_csv(
            'data/ans/transformer_age.csv', header=True, columns=['predicted_age'], index=False)

    # Build the final submission: one row per test user, ordered by user_id
    # (the same ordering used when the test sequences were generated).
    # NOTE(review): this reads BOTH task CSVs, so it fails until gender and
    # age have each been predicted at least once — confirm run order.
    user_id_test = pd.read_csv(
        'data/test/clicklog_ad.csv').sort_values(['user_id'], ascending=(True,)).user_id.unique()
    ans = pd.DataFrame({'user_id': user_id_test})

    gender = pd.read_csv('data/ans/transformer_gender.csv')
    age = pd.read_csv('data/ans/transformer_age.csv')
    ans['predicted_gender'] = gender.predicted_gender
    ans['predicted_age'] = age.predicted_age
    ans.to_csv('data/ans/submission.csv', header=True, index=False,
               columns=['user_id', 'predicted_age', 'predicted_gender'])
# %%
# y_pred = np.where(y_pred > 0.5, 1, 0)
# y_pred = y_pred.flatten()

Expand Down
Loading

0 comments on commit e55be5c

Please sign in to comment.