diff --git a/README.md b/README.md index b572470..f4025d2 100644 --- a/README.md +++ b/README.md @@ -10,15 +10,18 @@ - [ ] +LightGBM - [ ] +RNN - [x] 处理成序列问题后,使用word2vec生成词嵌入后 + - [x] +LightGBM (accuracy: 0.8) - [x] 3个特征+Dense (**accuracy: 1.05**) - - [ ] 6个特征+Dense (accuracy: ) + - [x] 6个128特征+Conv 1D (**accuracy: 1.15**) + - [x] 6个128特征+Dense (**accuracy: 1.20**) - [ ] +RNN等序列模型 - - [x] +LightGBM (accuraty: 0.8) - [ ] GNN生成user_id creative_id ad_id等的词嵌入后分类 - [ ] +Dense - [ ] +RNN等序列模型 - [ ] +LightGBM - [ ] TF-IDF + - [ ] +LightGBM + - [ ] +Dense - [ ] DeepFM、DeepFFM等 - [ ] 集成学习:比赛最后阶段使用上分 diff --git a/word2vec.py b/word2vec.py index 386bab5..1f93871 100644 --- a/word2vec.py +++ b/word2vec.py @@ -38,7 +38,7 @@ sentences = LineSentence('word2vec/userid_creativeids.txt') dimension_embedding = 128 model = Word2Vec(sentences, size=dimension_embedding, - window=3, min_count=1, workers=-1) + window=10, min_count=1, workers=-1) model.save("word2vec/word2vec.model") path = "word2vec/wordvectors.kv" model.wv.save(path) @@ -71,10 +71,10 @@ key='df_creativeid_embedding', mode='r') # %% # 不需要读出list -with open('word2vec/userid_creativeids.txt', 'r')as f: - seq_creative_id = f.readlines() -seq_creative_id = [[str(e) for e in line.strip().split(' ')] - for line in seq_creative_id] +# with open('word2vec/userid_creativeids.txt', 'r')as f: +# seq_creative_id = f.readlines() +# seq_creative_id = [[str(e) for e in line.strip().split(' ')] +# for line in seq_creative_id] # %% userid_creativeid_embedding = pd.merge( @@ -95,13 +95,13 @@ df_user_embedding = pd.DataFrame(data) # %% # this will take 24 hours!!! 
-# debug = 0 -for user in tqdm(range(len(seq_creative_id))): - user_em = df_creativeid_embedding.loc[seq_creative_id[user]].mean() - # df_user_embedding = df_user_embedding.append(user_em, ignore_index=True) - # debug += 1 - # if debug == 10: - # break +# # debug = 0 +# for user in tqdm(range(len(seq_creative_id))): +# user_em = df_creativeid_embedding.loc[seq_creative_id[user]].mean() +# df_user_embedding = df_user_embedding.append(user_em, ignore_index=True) +# debug += 1 +# if debug == 10: +# break # debug = 0 # frames = [] # for creative_id in tqdm.tqdm(wv.vocab):