train_models.py
# Source: https://github.com/SeldonIO/seldon-core/tree/master/examples/models/sklearn_spacy_text
import logging

import dill
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from app.RedditClassifier import RedditClassifier
from utils.ml_utils import CleanTextTransformer, SpacyTokenTransformer
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - [%(filename)s:%(lineno)d] - %(levelname)s - %(message)s",
)
def train():
    df_cols = ["prev_idx", "parent_idx", "body", "removed"]
    TEXT_COLUMN = "body"

    # Reads the 50k reddit dataset of moderated comments, available at:
    # https://raw.githubusercontent.com/axsauze/reddit-classification-exploration/master/data/reddit_train.csv
    logging.info("Read reddit training dataset.")
    df = pd.read_csv(
        "data/reddit_train.csv",
        names=df_cols,
        skiprows=1,  # skip the CSV's own header row, since names= supplies one
        encoding="ISO-8859-1",
    )
    x = df[TEXT_COLUMN].values
    y = df["removed"].values
logging.info("Train test split.")
x_train, x_test, y_train, y_test = train_test_split(
x, y, stratify=y, random_state=42, test_size=0.1, shuffle=True
)
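    # Note: stratify=y preserves the proportion of removed vs. kept comments
    # in both splits, so the held-out set reflects the training distribution.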
    # Clean the text
    logging.info("Clean the text.")
    clean_text_transformer = CleanTextTransformer()
    x_train_clean = clean_text_transformer.transform(x_train)

    # Tokenize the text and get the lemmas
    logging.info("Tokenize the text and get the lemmas.")
    spacy_tokenizer = SpacyTokenTransformer()
    x_train_tokenized = spacy_tokenizer.transform(x_train_clean)
    # Build tfidf vectorizer. The input is already tokenized, so the
    # preprocessor and tokenizer are identity functions and the default
    # token_pattern is disabled.
    logging.info("Build tfidf vectorizer.")
    tfidf_vectorizer = TfidfVectorizer(
        max_features=10000,
        preprocessor=lambda x: x,
        tokenizer=lambda x: x,
        token_pattern=None,
        ngram_range=(1, 3),
    )
    tfidf_vectorizer.fit(x_train_tokenized)
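    # Because of the identity preprocessor/tokenizer above, the vectorizer
    # consumes lists of tokens rather than raw strings, e.g. (illustrative
    # input only):
    #   tfidf_vectorizer.transform([["this", "comment", "was", "removed"]])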
    # Transform our tokens to tfidf vectors
    logging.info("Transform our tokens to tfidf vectors.")
    x_train_tfidf = tfidf_vectorizer.transform(x_train_tokenized)

    # Train logistic regression classifier
    logging.info("Train logistic regression classifier.")
    lr = LogisticRegression(C=0.1, solver="sag")
    lr.fit(x_train_tfidf, y_train)
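    # A quick sanity check is possible at this point if desired, e.g. class
    # probabilities for the first training comment (illustrative only):
    #   lr.predict_proba(x_train_tfidf[:1])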
    # These are the models we'll deploy
    logging.info("Dump models.")
    with open("models/tfidf_vectorizer.model", "wb") as model_file:
        dill.dump(tfidf_vectorizer, model_file)
    with open("models/lr.model", "wb") as model_file:
        dill.dump(lr, model_file)
    logging.info("Finished training.")
if __name__ == "__main__":
    train()

    logging.info("Test model prediction.")
    classifier = RedditClassifier(models_dir="models")

    # Re-read the dataset and reproduce the split above (same random_state)
    # so the sample below comes from the held-out test set.
    df_cols = ["prev_idx", "parent_idx", "body", "removed"]
    df = pd.read_csv(
        "data/reddit_train.csv",
        names=df_cols,
        skiprows=1,
        encoding="ISO-8859-1",
    )
    x = df["body"].values
    y = df["removed"].values
    logging.info("Train test split.")
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, stratify=y, random_state=42, test_size=0.1, shuffle=True
    )

    # Predict with one sample
    sample = x_test[0:1]
    logging.info(sample)
    logging.info(classifier.predict(sample, ["feature_name"]))
    logging.info("Finished testing.")