From 6732d8080f3f44e1f14d63b09b29b5b21a21b524 Mon Sep 17 00:00:00 2001
From: sunlanchang
Date: Fri, 29 May 2020 15:17:21 +0800
Subject: [PATCH] =?UTF-8?q?=E6=96=B0=E7=9A=84=E5=88=86=E6=95=B0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md   |  2 +-
 stacking.py | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 60 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index eb06de1..b463309 100644
--- a/README.md
+++ b/README.md
@@ -20,7 +20,7 @@
   - [ ] +RNN等序列模型
   - [ ] +LightGBM
 - [ ] TF-IDF
-  - [ ] +LightGBM (**accuracy: 1.26**)
+  - [ ] min_df=30 + LightGBM (**accuracy: 1.30**)
   - [ ] +Dense
 - [ ] DeepFM、DeepFFM等
 - [ ] 集成学习:比赛最后阶段使用上分
diff --git a/stacking.py b/stacking.py
index e69de29..bbf0b74 100644
--- a/stacking.py
+++ b/stacking.py
@@ -0,0 +1,59 @@
+# %%
+# Build TF-IDF features over the per-user creative-id corpus for stacking.
+from itertools import islice
+
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.model_selection import train_test_split
+import pandas as pd
+import numpy as np
+import lightgbm as lgb
+from mail import mail
+# %%
+# How many leading corpus rows are used as training / test examples.
+# Presumably the corpus lists training users first -- TODO confirm.
+# train_examples = 100
+# test_examples = 200
+train_examples = 900000
+test_examples = 1000000
+train_test = train_examples + test_examples
+# %%
+# Labels sorted by user_id; the slicing below assumes the corpus row order.
+user = pd.read_csv('data/train_preliminary/user.csv').sort_values('user_id')
+Y_train_gender = user.gender
+Y_train_age = user.age
+# %%
+# One stripped line per document; read at most train_test lines, then close.
+with open('word2vec/userid_creativeids.txt', 'r') as f:
+    corpus = [row.strip() for row in islice(f, train_test)]
+# %%
+# Subtract 1 from each label (presumably 1-based -> 0-based class ids).
+Y_train_gender = Y_train_gender.iloc[:train_examples] - 1
+Y_train_age = Y_train_age.iloc[:train_examples] - 1
+# %%
+min_df = 30
+max_df = 0.001  # kept for experiments; NOT passed to the vectorizer below
+vectorizer = TfidfVectorizer(
+    token_pattern=r"(?u)\b\w+\b",
+    min_df=min_df,
+    # max_df=max_df,
+    # max_features=128,
+    dtype=np.float32,
+)
+all_data = vectorizer.fit_transform(corpus)
+print('(examples, features)', all_data.shape)
+# Report the settings the vectorizer really used, not the unused max_df above.
+tfidf_msg = 'train tfidf done! min_df={}, max_df={} shape is {}'.format(
+    vectorizer.min_df, vectorizer.max_df, all_data.shape[1])
+print(tfidf_msg)
+mail(tfidf_msg)
+# %%
+train_val = all_data[:train_examples, :]
+# %%
+X_test = all_data[train_examples:(train_examples + test_examples), :]
+# %%
+# Only user_id is needed, so skip parsing the other click-log columns.
+test_user_id = pd.read_csv(
+    'data/test/click_log.csv', usecols=['user_id'],
+).sort_values('user_id').user_id.unique()
+# %%
+test_user_id = test_user_id[:test_examples]