-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
18944b2
commit ff53f9f
Showing
6 changed files
with
808 additions
and
6 deletions.
There are no files selected for viewing
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,259 @@ | ||
# %% | ||
from tensorflow.keras.layers import Dense | ||
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier | ||
from sklearn.model_selection import cross_val_score | ||
from sklearn.preprocessing import LabelEncoder | ||
from sklearn.model_selection import StratifiedKFold | ||
from sklearn.preprocessing import StandardScaler | ||
from sklearn.pipeline import Pipeline | ||
import pandas as pd | ||
import numpy as np | ||
import tensorflow as tf | ||
from tensorflow import keras | ||
from keras.utils import to_categorical | ||
# import keras | ||
from tensorflow.keras import layers | ||
from tensorflow.keras.utils import multi_gpu_model | ||
import gc | ||
# %% | ||
# Names of the per-feature word2vec outputs; each one is read from
# 'word2vec/<name>data_vec.csv' in the loop below.
vec_dirs = ['ad_id', 'advertiser_id', 'creative_id',
            'industry', 'product_category']
# NOTE(review): `samples` is not referenced anywhere in the visible code.
samples = 10000

# Training users sorted by user_id; the labels are taken from this frame.
user_train = pd.read_csv(
    'data/train_preliminary/user.csv').sort_values(['user_id'], ascending=(True,))
train_data = pd.DataFrame({'user_id': user_train.user_id})

# Distinct test user ids, taken from the click log after sorting by user_id.
user_id_test = pd.read_csv(
    'data/test/clicklog_ad_user_test.csv').sort_values(['user_id'], ascending=(True,)).user_id.unique()
test_data = pd.DataFrame({'user_id': user_id_test})

for i, vec_dir in enumerate(vec_dirs):
    # Every file gets its own 16 column labels: '0'..'15', '16'..'31', ...
    columns = [str(n) for n in range(i*16, (i+1)*16)]
    # The first 900000 rows are read as the training part; the test part
    # starts after skipping the first 900001 lines of the same file.
    # NOTE(review): one line sits between the two ranges -- confirm whether
    # the file has a header row or whether a data row is being dropped.
    train = pd.read_csv('word2vec/'+vec_dir+'data_vec.csv',
                        nrows=900000, skiprows=None, names=columns)
    test = pd.read_csv('word2vec/'+vec_dir+'data_vec.csv',
                       names=columns,
                       skiprows=900001)
    # NOTE(review): `train`/`test` are only given the numeric labels above;
    # confirm they actually expose a 'user_id' column to merge on.
    train_data = pd.merge(train_data, train, on='user_id', how='inner')
    test_data = pd.merge(test_data, test, on='user_id', how='inner')

# NOTE(review): `columns` is the list from the LAST loop iteration only
# (16 labels), while the model builders declare a 128-wide input -- confirm
# which feature columns are meant to be selected here.
X_train = train_data[columns].values
del train_data
gc.collect()
X_test = test_data[columns].values
del test_data
gc.collect()

# NOTE(review): labels come straight from `user_train`, features from an
# inner merge; row alignment holds only if the merge keeps every user in order.
Y_gender = user_train['gender'].values
Y_age = user_train['age'].values

# Reuse the ids computed above instead of reading, sorting and
# de-duplicating the test click log a second time.
ans = pd.DataFrame({'user_id': user_id_test})
# %% | ||
|
||
|
||
def create_gender_model():
    """Build and compile the binary gender classifier.

    The network takes a 128-wide input, passes it through four ELU dense
    layers (256, 512, 256, 128 units) and ends in a single sigmoid unit.
    It is compiled with binary cross-entropy, the Adam optimizer and an
    accuracy metric.

    Returns:
        The compiled ``keras.Sequential`` model.
    """
    hidden_widths = (256, 512, 256, 128)

    stack = [keras.Input(shape=(128,))]
    for width in hidden_widths:
        stack.append(layers.Dense(width, activation="elu"))
    # One sigmoid output: probability of the class encoded as 1.
    stack.append(Dense(1, activation='sigmoid'))

    model = keras.Sequential(stack)
    model.compile(
        loss='binary_crossentropy',
        optimizer='Adam',
        metrics=['accuracy'],
    )
    return model
|
||
# %% | ||
|
||
|
||
def create_age_model():
    """Build and compile the 10-class age classifier.

    The network takes a 128-wide input, passes it through four ELU dense
    layers (256, 512, 256, 128 units) and ends in a 10-way softmax. It is
    compiled with categorical cross-entropy, the Adam optimizer and an
    accuracy metric.

    Returns:
        The compiled ``keras.Sequential`` model.
    """
    hidden_widths = (256, 512, 256, 128)

    stack = [keras.Input(shape=(128,))]
    for width in hidden_widths:
        stack.append(layers.Dense(width, activation="elu"))
    # Softmax over the 10 age classes.
    stack.append(Dense(10, activation='softmax'))

    model = keras.Sequential(stack)
    model.compile(
        loss='categorical_crossentropy',
        optimizer='Adam',
        metrics=['accuracy'],
    )
    return model
|
||
# %% | ||
|
||
|
||
def train_gender(X, Y, X_test, train=True, epoch=10, batch_size=1024):
    """Fit the gender model and predict, or cross-validate it.

    Args:
        X: Training feature matrix.
        Y: Training gender labels; they are integer-encoded internally.
        X_test: Feature matrix to predict on (used only when ``train``).
        train: When true, fit on all of ``X`` and return predictions for
            ``X_test``. When false, run 5-fold stratified cross-validation,
            print the mean/std score and return ``None``.
        epoch: Number of training epochs.
        batch_size: Mini-batch size used for fitting.

    Returns:
        With ``train=True``: an array of 0/1 encoded predictions obtained
        by thresholding the model output at 0.5 (same shape as the output
        of ``model.predict``). With ``train=False``: ``None``.
    """
    # Map the raw class labels to integer codes.
    labels = LabelEncoder().fit_transform(Y)

    if not train:
        # Evaluation only: standardise inside the pipeline so every fold is
        # scaled using its own training split.
        pipeline = Pipeline([
            ('standardize', StandardScaler()),
            ('mlp', KerasClassifier(
                build_fn=create_gender_model, epochs=epoch,
                batch_size=batch_size, verbose=0)),
        ])
        folds = StratifiedKFold(n_splits=5, shuffle=True)
        scores = cross_val_score(pipeline, X, labels, cv=folds)
        print("Baseline: %.2f%% (%.2f%%)" %
              (scores.mean()*100, scores.std()*100))
        return None

    # Standardise with statistics from the training features only.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    model = create_gender_model()
    model.fit(X_scaled, labels, batch_size=batch_size, epochs=epoch)

    probabilities = model.predict(scaler.transform(X_test))
    # Outputs above 0.5 are assigned to encoded class 1.
    return np.where(probabilities > 0.5, 1, 0)
|
||
# parallel_model = multi_gpu_model(model, gpus=2) | ||
# parallel_model = model | ||
# parallel_model.fit(X, Y, epochs=10, batch_size=batch_size) | ||
# gender_pred = parallel_model.predict(X_test, batch_size=batch_size) | ||
# return gender_pred | ||
# %% | ||
|
||
|
||
def train_age(X, Y, X_test, train=True, epoch=10, batch_size=1024):
    """Fit the age model and predict, or cross-validate it.

    Args:
        X: Training feature matrix.
        Y: Training age labels; they are integer-encoded internally.
        X_test: Feature matrix to predict on (used only when ``train``).
        train: When true, fit on all of ``X`` and return predictions for
            ``X_test``. When false, run 10-fold stratified cross-validation,
            print the mean/std score and return ``None``.
        epoch: Number of training epochs.
        batch_size: Mini-batch size used for fitting.

    Returns:
        With ``train=True``: a 1-D array holding the encoded class index
        (argmax of the softmax output) for each row of ``X_test``.
        With ``train=False``: ``None``.
    """
    # Map the raw class labels to consecutive integer codes.
    age_codes = LabelEncoder().fit_transform(Y)

    if not train:
        # Evaluation only: standardise inside the pipeline so every fold is
        # scaled using its own training split.
        pipeline = Pipeline([
            ('standardize', StandardScaler()),
            ('mlp', KerasClassifier(
                build_fn=create_age_model, epochs=epoch,
                batch_size=batch_size, verbose=0)),
        ])
        folds = StratifiedKFold(n_splits=10, shuffle=True)
        scores = cross_val_score(pipeline, X, age_codes, cv=folds)
        print("Baseline: %.2f%% (%.2f%%)" %
              (scores.mean()*100, scores.std()*100))
        return None

    # Standardise with statistics from the training features only.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    # The softmax / categorical cross-entropy head needs one-hot targets.
    one_hot = to_categorical(age_codes)

    model = create_age_model()
    model.fit(X_scaled, one_hot, batch_size=batch_size, epochs=epoch)

    class_probs = model.predict(scaler.transform(X_test))
    return np.argmax(class_probs, axis=1)
|
||
|
||
# %% | ||
# With train=False both helpers only print cross-validation scores and
# return None; pass train=True to obtain predictions for the submission.
y_gender = train_gender(X_train, Y_gender, X_test,
                        train=False, epoch=50, batch_size=4096)
y_age = train_age(X_train, Y_age, X_test,
                  train=False, epoch=50, batch_size=4096)
# %%
if y_age is not None and y_gender is not None:
    # Flatten to 1-D so each prediction array is assigned as a single column.
    # presumably +1 maps the 0-based encoded classes back to 1-based labels
    # -- verify against the label values in user.csv.
    ans['predicted_age'] = np.ravel(y_age) + 1
    ans['predicted_gender'] = np.ravel(y_gender) + 1
    ans.to_csv('data/ans/word2vec.csv',
               columns=['user_id', 'predicted_age', 'predicted_gender'],
               header=True,
               index=False,
               )
else:
    # Previously this path crashed with a TypeError on `None + 1`.
    print("No predictions to write: rerun the training cell with train=True.")
|
||
# %% | ||
# %% | ||
# df_train = df_train.sort_values( | ||
# ["user_id"], ascending=(True,)) | ||
|
||
# # %% | ||
|
||
|
||
# def get_batch(file_name,): | ||
# for row in open(file_name, "r"): | ||
# yield 1 | ||
|
||
|
||
# for line in get_batch('data/train_data.csv'): | ||
# for line in get_batch('test.py'): | ||
# print(line) | ||
# break | ||
# %% | ||
# 合成用户embedding | ||
# path = "word2vec/wordvectors.kv" | ||
# wv = KeyedVectors.load(path, mmap='r') | ||
# with open('word2vec/userid_creativeids.txt', 'r')as f: | ||
# lines = f.readlines() | ||
# lines = [[int(e) for e in line.split(' ')] for line in lines] | ||
# number_train_user = 900000 | ||
# number_test_user = 1000000 | ||
# user_train = lines[:number_train_user] | ||
# user_test = lines[number_train_user:] | ||
# columns = ['c'+str(i) for i in range(128)] | ||
# data = {} | ||
# for col_name in columns: | ||
# data[col_name] = pd.Series([], dtype='float') | ||
# df_user_train = pd.DataFrame(data) | ||
# df_user_test = pd.DataFrame(data) | ||
# # %% | ||
# for line in tqdm.tqdm(user_train): | ||
# user_embedding_train = np.zeros(128) | ||
# for creative_id in line: | ||
# user_embedding_train += wv[str(creative_id)] | ||
# user_embedding_train = user_embedding_train / len(line) | ||
# tmp = pd.DataFrame(user_embedding_train.reshape(-1, | ||
# len(user_embedding_train)), columns=columns) | ||
# df_user_train = df_user_train.append(tmp) | ||
# # %% | ||
# for line in tqdm.tqdm(user_test): | ||
# user_embedding_test = np.zeros(128) | ||
# for creative_id in line: | ||
# user_embedding_test += wv[str(creative_id)] | ||
# user_embedding_test = user_embedding_test / len(line) | ||
# tmp = pd.DataFrame(user_embedding_test.reshape(-1, | ||
# len(user_embedding_train)), columns=columns) | ||
# df_user_test = df_user_test.append(tmp) | ||
# # %% | ||
# # 将同一个用户creative_id相加平均后即为一个用户的Embedding | ||
# all_train_data = pd.read_csv( | ||
# 'data/train_preliminary/clicklog_ad_user_train_eval_test.csv') | ||
# all_train_data = all_train_data.sort_values( | ||
# ["user_id"], ascending=(True)) | ||
# # %% | ||
# all_test_data = pd.read_csv( | ||
# 'data/test/clicklog_ad_user_test.csv') | ||
# all_test_data = all_test_data.sort_values( | ||
# ["user_id"], ascending=(True)) | ||
# # %% | ||
# assert df_user_train.shape[0] == all_train_data.shape[0] | ||
# df_user_train['user_id'] = all_train_data['user_id'] | ||
# df_user_train['gender'] = all_train_data['gender'] | ||
# df_user_train['age'] = all_train_data['age'] | ||
# df_user_train.to_hdf('word2vec/df_user_train_test.h5', | ||
# key='df_user_train', mode='w') | ||
# # %% | ||
# assert df_user_test.shape[0] == all_test_data.shape[0] | ||
# df_user_test['user_id'] = all_test_data['user_id'] | ||
# df_user_test.to_hdf('word2vec/df_user_train_test.h5', | ||
# key='df_user_test', mode='a') | ||
|
||
|
||
# %% |
Oops, something went wrong.