
Commit

Add tf-idf
sunlanchang committed May 19, 2020
1 parent 18944b2 commit ff53f9f
Showing 6 changed files with 808 additions and 6 deletions.
File renamed without changes.
259 changes: 259 additions & 0 deletions Dense_embedding_96_dim.py
@@ -0,0 +1,259 @@
# %%
from tensorflow.keras.layers import Dense
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.utils import to_categorical
# import keras
from tensorflow.keras import layers
from tensorflow.keras.utils import multi_gpu_model
import gc
# %%
vec_dirs = ['ad_id', 'advertiser_id', 'creative_id',
            'industry', 'product_category']
samples = 10000
user_train = pd.read_csv(
    'data/train_preliminary/user.csv').sort_values(['user_id'], ascending=(True,))
train_data = pd.DataFrame({'user_id': user_train.user_id})

user_id_test = pd.read_csv(
    'data/test/clicklog_ad_user_test.csv').sort_values(['user_id'], ascending=(True,)).user_id.unique()
test_data = pd.DataFrame({'user_id': user_id_test})

# Merge the per-field word2vec user vectors onto the train/test user tables.
for i, vec_dir in enumerate(vec_dirs):
    columns = [str(n) for n in range(i*16, (i+1)*16)]
    train = pd.read_csv('word2vec/'+vec_dir+'data_vec.csv',
                        nrows=900000, skiprows=None, names=columns)
    test = pd.read_csv('word2vec/'+vec_dir+'data_vec.csv',
                       names=columns,
                       skiprows=900001)
    train_data = pd.merge(train_data, train, on='user_id', how='inner')
    test_data = pd.merge(test_data, test, on='user_id', how='inner')

X_train = train_data[columns].values
del train_data
gc.collect()
X_test = test_data[columns].values
del test_data
gc.collect()

Y_gender = user_train['gender'].values
Y_age = user_train['age'].values

user_id_test = pd.read_csv(
    'data/test/clicklog_ad_user_test.csv').sort_values(['user_id'], ascending=(True,)).user_id.unique()
ans = pd.DataFrame({'user_id': user_id_test})
# %%


def create_gender_model():
    model = keras.Sequential(
        [
            keras.Input(shape=(128,)),
            layers.Dense(256, activation="elu"),
            layers.Dense(512, activation="elu"),
            # layers.Dense(1024, activation="elu"),
            # layers.Dense(512, activation="elu"),
            layers.Dense(256, activation="elu"),
            layers.Dense(128, activation='elu'),
            # layers.Dense(2, activation='softmax', name='classifier')
            Dense(1, activation='sigmoid')
        ]
    )
    model.compile(loss='binary_crossentropy',
                  optimizer='Adam', metrics=['accuracy'])
    # model.summary()
    return model

# %%


def create_age_model():
    model = keras.Sequential(
        [
            keras.Input(shape=(128,)),
            layers.Dense(256, activation="elu"),
            layers.Dense(512, activation="elu"),
            # layers.Dense(1024, activation="elu"),
            # layers.Dense(512, activation="elu"),
            layers.Dense(256, activation="elu"),
            layers.Dense(128, activation='elu'),
            Dense(10, activation='softmax')
        ]
    )
    model.compile(loss='categorical_crossentropy',
                  optimizer='Adam', metrics=['accuracy'])
    # model.summary()
    return model

# %%


def train_gender(X, Y, X_test, train=True, epoch=10, batch_size=1024):
    # Encode the gender labels as 0/1
    encoder = LabelEncoder()
    encoder.fit(Y)
    Y_encoded = encoder.transform(Y)
    if train:
        scaler = StandardScaler()
        scaler.fit(X)
        X = scaler.transform(X)
        model = create_gender_model()
        model.fit(X, Y_encoded, batch_size=batch_size, epochs=epoch)

        X_test = scaler.transform(X_test)
        y_pre = model.predict(X_test)
        threshold = 0.5
        y_pred_gender = np.where(y_pre > threshold, 1, 0)
        return y_pred_gender
    else:
        # Cross-validation only: prints the score and returns no predictions.
        estimators = []
        estimators.append(('standardize', StandardScaler()))
        estimators.append(('mlp', KerasClassifier(
            build_fn=create_gender_model, epochs=epoch, batch_size=batch_size, verbose=0)))
        pipeline = Pipeline(estimators)
        kfold = StratifiedKFold(n_splits=5, shuffle=True)
        results = cross_val_score(pipeline, X, Y_encoded, cv=kfold)
        print("Baseline: %.2f%% (%.2f%%)" %
              (results.mean()*100, results.std()*100))

        # parallel_model = multi_gpu_model(model, gpus=2)
        # parallel_model = model
        # parallel_model.fit(X, Y, epochs=10, batch_size=batch_size)
        # gender_pred = parallel_model.predict(X_test, batch_size=batch_size)
        # return gender_pred
# %%


def train_age(X, Y, X_test, train=True, epoch=10, batch_size=1024):
    # Encode the age labels as integer classes
    encoder = LabelEncoder()
    encoder.fit(Y)
    Y = encoder.transform(Y)
    if train:
        scaler = StandardScaler()
        scaler.fit(X)
        X = scaler.transform(X)
        Y = to_categorical(Y)
        model = create_age_model()
        model.fit(X, Y, batch_size=batch_size, epochs=epoch)

        X_test = scaler.transform(X_test)
        y_pre = model.predict(X_test)
        y_pred_age = np.argmax(y_pre, axis=1)

        return y_pred_age
    else:
        # Cross-validation only: prints the score and returns no predictions.
        # estimator = KerasClassifier(
        #     build_fn=create_gender_model, epochs=epoch, batch_size=batch_size, verbose=0)
        estimators = []
        estimators.append(('standardize', StandardScaler()))
        estimators.append(('mlp', KerasClassifier(
            build_fn=create_age_model, epochs=epoch, batch_size=batch_size, verbose=0)))
        pipeline = Pipeline(estimators)
        kfold = StratifiedKFold(n_splits=10, shuffle=True)
        results = cross_val_score(pipeline, X, Y, cv=kfold)
        print("Baseline: %.2f%% (%.2f%%)" %
              (results.mean()*100, results.std()*100))


# %%
# NOTE: with train=False these calls only print cross-validation scores and
# return None; run with train=True to obtain predictions for the submission below.
y_gender = train_gender(X_train, Y_gender, X_test,
                        train=False, epoch=50, batch_size=4096)
y_age = train_age(X_train, Y_age, X_test,
                  train=False, epoch=50, batch_size=4096)
# %%
ans['predicted_age'] = y_age+1
ans['predicted_gender'] = y_gender+1
# %%
ans.to_csv('data/ans/word2vec.csv',
           columns=['user_id', 'predicted_age', 'predicted_gender'],
           header=True,
           index=False,
           )

# %%
# %%
# df_train = df_train.sort_values(
#     ["user_id"], ascending=(True,))

# # %%


# def get_batch(file_name,):
#     for row in open(file_name, "r"):
#         yield 1


# for line in get_batch('data/train_data.csv'):
# for line in get_batch('test.py'):
#     print(line)
#     break
# %%
# Build user embeddings
# path = "word2vec/wordvectors.kv"
# wv = KeyedVectors.load(path, mmap='r')
# with open('word2vec/userid_creativeids.txt', 'r')as f:
#     lines = f.readlines()
# lines = [[int(e) for e in line.split(' ')] for line in lines]
# number_train_user = 900000
# number_test_user = 1000000
# user_train = lines[:number_train_user]
# user_test = lines[number_train_user:]
# columns = ['c'+str(i) for i in range(128)]
# data = {}
# for col_name in columns:
#     data[col_name] = pd.Series([], dtype='float')
# df_user_train = pd.DataFrame(data)
# df_user_test = pd.DataFrame(data)
# # %%
# for line in tqdm.tqdm(user_train):
#     user_embedding_train = np.zeros(128)
#     for creative_id in line:
#         user_embedding_train += wv[str(creative_id)]
#     user_embedding_train = user_embedding_train / len(line)
#     tmp = pd.DataFrame(user_embedding_train.reshape(-1,
#                        len(user_embedding_train)), columns=columns)
#     df_user_train = df_user_train.append(tmp)
# # %%
# for line in tqdm.tqdm(user_test):
#     user_embedding_test = np.zeros(128)
#     for creative_id in line:
#         user_embedding_test += wv[str(creative_id)]
#     user_embedding_test = user_embedding_test / len(line)
#     tmp = pd.DataFrame(user_embedding_test.reshape(-1,
#                        len(user_embedding_test)), columns=columns)
#     df_user_test = df_user_test.append(tmp)
# # %%
# # Summing and averaging a user's creative_id vectors gives that user's embedding
# all_train_data = pd.read_csv(
#     'data/train_preliminary/clicklog_ad_user_train_eval_test.csv')
# all_train_data = all_train_data.sort_values(
#     ["user_id"], ascending=(True))
# # %%
# all_test_data = pd.read_csv(
#     'data/test/clicklog_ad_user_test.csv')
# all_test_data = all_test_data.sort_values(
#     ["user_id"], ascending=(True))
# # %%
# assert df_user_train.shape[0] == all_train_data.shape[0]
# df_user_train['user_id'] = all_train_data['user_id']
# df_user_train['gender'] = all_train_data['gender']
# df_user_train['age'] = all_train_data['age']
# df_user_train.to_hdf('word2vec/df_user_train_test.h5',
#                      key='df_user_train', mode='w')
# # %%
# assert df_user_test.shape[0] == all_test_data.shape[0]
# df_user_test['user_id'] = all_test_data['user_id']
# df_user_test.to_hdf('word2vec/df_user_train_test.h5',
#                      key='df_user_test', mode='a')


# %%
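# The commit title says "Add tf-idf", but the TF-IDF code lives in the other
# changed files of this commit rather than in this script. A minimal, hypothetical
# sketch of building TF-IDF features over each user's click sequence with
# scikit-learn is kept below as a comment; the click-log path and the column
# names ('time', 'user_id', 'creative_id') are assumptions, not this repo's code.
#
# from sklearn.feature_extraction.text import TfidfVectorizer
#
# click_log = pd.read_csv('data/train_preliminary/click_log.csv')  # assumed path
# # Treat each user's chronologically ordered creative_id clicks as one "document".
# docs = (click_log.sort_values('time')
#         .groupby('user_id')['creative_id']
#         .apply(lambda ids: ' '.join(map(str, ids))))
# vectorizer = TfidfVectorizer(max_features=100000, token_pattern=r'\S+')
# X_tfidf = vectorizer.fit_transform(docs.values)  # sparse (n_users, vocab_size)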