Skip to content

Commit

Permalink
新的分数
Browse files Browse the repository at this point in the history
  • Loading branch information
sunlanchang committed May 29, 2020
1 parent 6732d80 commit 383deac
Show file tree
Hide file tree
Showing 2 changed files with 180 additions and 97 deletions.
149 changes: 52 additions & 97 deletions Dense_embedding_16_dim.py → Dense_embedding_128_dim.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
# %%
import gc
from tensorflow.keras.utils import multi_gpu_model
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
Expand All @@ -11,37 +14,68 @@
import tensorflow as tf
from tensorflow import keras
from keras.utils import to_categorical
from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.preprocessing import sequence
# fix random seed for reproducibility
np.random.seed(7)
# import keras
from tensorflow.keras import layers
from tensorflow.keras.utils import multi_gpu_model
import gc
# %%
# NOTE(review): the two `samples` assignments and the first
# train_data/test_data read_csv blocks below appear to be stale leftovers
# (each is immediately overwritten before use) — confirm which version is
# intended and delete the dead one.
samples = 10000
samples = 100000
columns = ['c'+str(n) for n in range(128)]
train_data = pd.read_csv('word2vec/train_data.csv',
                         nrows=900000, skiprows=None)
test_data = pd.read_csv('word2vec/train_data.csv',
                        names=columns,
                        skiprows=900001)

# Per-user 128-dim word2vec embeddings: first `samples` rows as train,
# the remainder as test.
train_data = pd.read_csv('word2vec/creative_id.csv',
                         # nrows=900000, skiprows=None)
                         nrows=samples, skiprows=None)
X_train = train_data[columns].values
del train_data  # drop the frame; only the ndarray is kept
gc.collect()
# %%
test_data = pd.read_csv('word2vec/creative_id.csv',
                        names=columns,
                        # skiprows=900001)
                        skiprows=samples)

X_test = test_data[columns].values
del test_data
gc.collect()
# %%
# Get the labels (sorted by user_id so rows align with the embedding matrix).
user_train = pd.read_csv(
    'data/train_preliminary/user.csv').sort_values(['user_id'], ascending=(True,))
Y_gender = user_train['gender'].values
Y_age = user_train['age'].values

# %%
# Unique test-set user ids, used as the skeleton of the answer file.
user_id_test = pd.read_csv(
    'data/test/clicklog_ad_user_test.csv').sort_values(['user_id'], ascending=(True,)).user_id.unique()
ans = pd.DataFrame({'user_id': user_id_test})
# %%
# create the model
# Feed each 128-dim embedding to the LSTM as a length-128 sequence of scalars.
X_train = X_train.reshape(-1, 128, 1)
# Shift gender labels down by 1 so they are 0-based for sigmoid + BCE.
Y_gender = Y_gender[:samples] - 1
model = keras.Sequential(
    [
        LSTM(100),
        Dense(1, activation='sigmoid')
    ]
)
model.compile(loss='binary_crossentropy',
              optimizer='adam', metrics=['accuracy'])
model.fit(X_train, Y_gender, validation_split=0.1, epochs=3, batch_size=64)
# %%
# Smoke-test the fitted model on random input to check the output shape.
inputs = tf.random.normal([32, 128, 1])
output = model.predict(inputs)
output.shape
# %%
# NOTE(review): y_train / y_test are not defined anywhere in this file, so
# this cell raises NameError if executed — presumably Y_gender (and a held-out
# split) were meant; confirm before running.
model.compile(loss='binary_crossentropy',
              optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(X_train, y_train, validation_data=(
    X_test, y_test), epochs=3, batch_size=64)
# %%
def create_gender_model():
def gender_dense():
model = keras.Sequential(
[
keras.Input(shape=(128,)),
Expand All @@ -63,7 +97,7 @@ def create_gender_model():
# %%


def create_age_model():
def age_dense():
model = keras.Sequential(
[
keras.Input(shape=(128,)),
Expand Down Expand Up @@ -93,7 +127,7 @@ def train_gender(X, Y, X_test, train=True, epoch=10, batch_size=1024):
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)
model = create_gender_model()
model = gender_dense()
model.fit(X, Y_encoded, batch_size=batch_size, epochs=epoch)

X_test = scaler.transform(X_test)
Expand All @@ -105,7 +139,7 @@ def train_gender(X, Y, X_test, train=True, epoch=10, batch_size=1024):
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp', KerasClassifier(
build_fn=create_gender_model, epochs=epoch, batch_size=batch_size, verbose=0)))
build_fn=gender_dense, epochs=epoch, batch_size=batch_size, verbose=0)))
pipeline = Pipeline(estimators)
kfold = StratifiedKFold(n_splits=5, shuffle=True)
results = cross_val_score(pipeline, X, Y_encoded, cv=kfold)
Expand All @@ -130,7 +164,7 @@ def train_age(X, Y, X_test, train=True, epoch=10, batch_size=1024):
scaler.fit(X)
X = scaler.transform(X)
Y = to_categorical(Y)
model = create_age_model()
model = age_dense()
model.fit(X, Y, batch_size=batch_size, epochs=epoch)

X_test = scaler.transform(X_test)
Expand All @@ -140,11 +174,11 @@ def train_age(X, Y, X_test, train=True, epoch=10, batch_size=1024):
return y_pred_age
else:
# estimator = KerasClassifier(
# build_fn=create_gender_model, epochs=epoch, batch_size=batch_size, verbose=0)
# build_fn=gender_dense, epochs=epoch, batch_size=batch_size, verbose=0)
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp', KerasClassifier(
build_fn=create_age_model, epochs=epoch, batch_size=batch_size, verbose=0)))
build_fn=age_dense, epochs=epoch, batch_size=batch_size, verbose=0)))
pipeline = Pipeline(estimators)
kfold = StratifiedKFold(n_splits=10, shuffle=True)
results = cross_val_score(pipeline, X, Y, cv=kfold)
Expand All @@ -166,82 +200,3 @@ def train_age(X, Y, X_test, train=True, epoch=10, batch_size=1024):
header=True,
index=False,
)

# %%
# %%
# df_train = df_train.sort_values(
# ["user_id"], ascending=(True,))

# # %%


# def get_batch(file_name,):
# for row in open(file_name, "r"):
# yield 1


# for line in get_batch('data/train_data.csv'):
# for line in get_batch('test.py'):
# print(line)
# break
# %%
# 合成用户embedding
# path = "word2vec/wordvectors.kv"
# wv = KeyedVectors.load(path, mmap='r')
# with open('word2vec/userid_creativeids.txt', 'r')as f:
# lines = f.readlines()
# lines = [[int(e) for e in line.split(' ')] for line in lines]
# number_train_user = 900000
# number_test_user = 1000000
# user_train = lines[:number_train_user]
# user_test = lines[number_train_user:]
# columns = ['c'+str(i) for i in range(128)]
# data = {}
# for col_name in columns:
# data[col_name] = pd.Series([], dtype='float')
# df_user_train = pd.DataFrame(data)
# df_user_test = pd.DataFrame(data)
# # %%
# for line in tqdm.tqdm(user_train):
# user_embedding_train = np.zeros(128)
# for creative_id in line:
# user_embedding_train += wv[str(creative_id)]
# user_embedding_train = user_embedding_train / len(line)
# tmp = pd.DataFrame(user_embedding_train.reshape(-1,
# len(user_embedding_train)), columns=columns)
# df_user_train = df_user_train.append(tmp)
# # %%
# for line in tqdm.tqdm(user_test):
# user_embedding_test = np.zeros(128)
# for creative_id in line:
# user_embedding_test += wv[str(creative_id)]
# user_embedding_test = user_embedding_test / len(line)
# tmp = pd.DataFrame(user_embedding_test.reshape(-1,
# len(user_embedding_train)), columns=columns)
# df_user_test = df_user_test.append(tmp)
# # %%
# # 将同一个用户creative_id相加平均后即为一个用户的Embedding
# all_train_data = pd.read_csv(
# 'data/train_preliminary/clicklog_ad_user_train_eval_test.csv')
# all_train_data = all_train_data.sort_values(
# ["user_id"], ascending=(True))
# # %%
# all_test_data = pd.read_csv(
# 'data/test/clicklog_ad_user_test.csv')
# all_test_data = all_test_data.sort_values(
# ["user_id"], ascending=(True))
# # %%
# assert df_user_train.shape[0] == all_train_data.shape[0]
# df_user_train['user_id'] = all_train_data['user_id']
# df_user_train['gender'] = all_train_data['gender']
# df_user_train['age'] = all_train_data['age']
# df_user_train.to_hdf('word2vec/df_user_train_test.h5',
# key='df_user_train', mode='w')
# # %%
# assert df_user_test.shape[0] == all_test_data.shape[0]
# df_user_test['user_id'] = all_test_data['user_id']
# df_user_test.to_hdf('word2vec/df_user_train_test.h5',
# key='df_user_test', mode='a')


# %%
128 changes: 128 additions & 0 deletions word2vec_creative_id.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
# 通过用户访问的creative_id的序列,生成每个creative_id的词嵌入
# %%
# Standard library
import multiprocessing
import pickle

# Third-party
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from gensim.test.utils import common_texts, get_tmpfile
from gensim.test.utils import datapath
from tqdm import tqdm

# Local
from mail import mail
# %%
# Load the train and test click logs, keep only the columns needed for the
# word2vec corpus, and sort each user's clicks by time so the creative_id
# sequence reflects click order.
df_train = pd.read_csv(
    'data/train_preliminary/clicklog_ad_user_train_eval_test.csv')
df_test = pd.read_csv('data/test/clicklog_ad_user_test.csv')
columns = ['user_id', 'creative_id', 'time']
frame = [df_train[columns], df_test[columns]]
df_train_test = pd.concat(frame, ignore_index=True)
df_train_test_sorted = df_train_test.sort_values(
    ["user_id", "time"], ascending=(True, True))
# %%
# Checkpoint the sorted frame so later cells can be re-run without
# repeating the expensive read + sort above.
with open('word2vec/df_train_test_sorted.pkl', 'wb') as f:
    pickle.dump(df_train_test_sorted, f)
# %%
# Reload the checkpoint (entry point when resuming from this cell).
with open('word2vec/df_train_test_sorted.pkl', 'rb') as f:
    df_train_test_sorted = pickle.load(f)
# %%
# One "sentence" per user: the time-ordered list of creative_ids that user
# clicked. This is the training corpus for word2vec.
userid_creative_ids = df_train_test_sorted.groupby(
    'user_id')['creative_id'].apply(list).reset_index(name='creative_ids')
# %%
# Write the corpus as whitespace-separated tokens, one user per line —
# the format gensim's LineSentence expects.
with open('word2vec/userid_creative_ids.txt', 'w') as f:
    for ids in userid_creative_ids.creative_ids:
        ids = [str(e) for e in ids]
        line = ' '.join(ids)
        f.write(line + '\n')
# %%
# Train a skip-gram (sg=1) word2vec model over the per-user creative_id
# sequences and persist both the full model and the bare KeyedVectors.
sentences = LineSentence('word2vec/userid_creative_ids.txt')
dimension_embedding = 128
# FIX: gensim requires a positive worker-thread count. The previous
# `workers=-1` (an sklearn-ism) made gensim start zero training threads,
# so the run "succeeded" while the vectors were never actually trained.
model = Word2Vec(sentences, size=dimension_embedding,
                 window=10, min_count=1,
                 workers=multiprocessing.cpu_count(), iter=10, sg=1)
model.save("word2vec/word2vec_creative_id.model")
path = "word2vec/wordvectors_creative_id.kv"
model.wv.save(path)
print('Save embedding done!!!')
# %%
# Load the trained vectors read-only (mmap='r' shares the big array
# instead of copying it).
path = "word2vec/wordvectors_creative_id.kv"
wv = KeyedVectors.load(path, mmap='r')
dimension_embedding = 128
columns = ['c'+str(i) for i in range(dimension_embedding)]
# (Removed a dead placeholder DataFrame that was built column-by-column
# from empty Series and then unconditionally overwritten below.)
# %%
# Collect {creative_id -> 128-dim vector} for every token in the vocabulary.
# wv.vocab is the gensim<4 API, consistent with size=/iter= used above.
data = {}
for key in tqdm(wv.vocab):
    data[int(key)] = wv[key].tolist()
# %%
# One row per creative_id; keep the id both as index and as an explicit
# column so it survives a later merge on 'creative_id'.
df_creative_id_embedding = pd.DataFrame.from_dict(
    data, orient='index',
    columns=columns)
df_creative_id_embedding['creative_id'] = df_creative_id_embedding.index
# %%
df_creative_id_embedding.to_hdf(
    'word2vec/df_creative_id_embedding.h5',
    key='df_creative_id_embedding', mode='w')
mail('save h5 done')
# %%
# Reload the persisted embedding table so this cell can run standalone.
df_creative_id_embedding = pd.read_hdf(
    'word2vec/df_creative_id_embedding.h5',
    key='df_creative_id_embedding', mode='r')
# %%
# Average each user's creative_id embeddings into a single 128-dim user
# vector and export it as the feature matrix for the downstream models.
try:
    userid_creative_id_embedding = pd.merge(
        df_train_test_sorted, df_creative_id_embedding, on='creative_id', how='left')
    userid_creative_id_embedding.drop(
        columns=['creative_id', 'time'], inplace=True)
    # NOTE(review): index=False drops the user_id group key from the csv,
    # so consumers must rely on row order alone — confirm that is intended.
    userid_creative_id_embedding.groupby('user_id').mean().to_csv(
        'word2vec/creative_id.csv', header=True, index=False)
    mail('to csv done')
except Exception as err:
    # FIX: the previous bare `except:` also swallowed KeyboardInterrupt /
    # SystemExit and hid the failure reason entirely; keep the best-effort
    # notification but include the actual error.
    mail('failed: ' + repr(err))
# %%
# columns = ['c'+str(i) for i in range(128)]
# data = {}
# for col_name in columns:
# data[col_name] = pd.Series([], dtype='float')
# df_user_embedding = pd.DataFrame(data)
# # %%
# # this will take 24 hours!!!
# # debug = 0
# for user in tqdm(range(len(seq_creative_id))):
# user_em = df_creative_id_embedding.loc[seq_creative_id[user]].mean()
# # df_user_embedding = df_user_embedding.append(user_em, ignore_index=True)
# debug += 1
# if debug == 10:
# break
# debug = 0
# frames = []
# for creative_id in tqdm.tqdm(wv.vocab):
# creativeid_embedding = wv[creative_id]
# tmp = pd.DataFrame(
# creativeid_embedding.reshape(-1, len(creativeid_embedding)),
# columns=columns[:-1])
# # df_creativeid_embedding = df_creativeid_embedding.append(tmp)
# frames.append(tmp)
# if len(frames) == 1000000:
# # frames = [df_creativeid_embedding, tmp]
# frames = [df_creativeid_embedding]+frames
# df_creativeid_embedding = pd.concat(frames)
# frames = []
# df_creativeid_embedding.iloc[-1, -1] = str(creative_id)
# %%
# if len(frames) != 0:
# frames = [df_creativeid_embedding]+frames
# df_creativeid_embedding = pd.concat(frames)
# df_creativeid_embedding.to_hdf('data/clicklog_ad_user_train_eval_test.h5',
# key='df_creativeid_embedding', mode='w')

# debug += 1
# if debug == 10:
# break


# %%

0 comments on commit 383deac

Please sign in to comment.