-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtrain.py
135 lines (101 loc) · 3.52 KB
/
train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import category_encoders as ce
import warnings
import sys
import os
import logging
import mlflow
import mlflow.sklearn
import dvc.api
# Emit warnings and above; module-level logger for this script.
logging.basicConfig(level=logging.WARN)
logger = logging.getLogger(__name__)

# Resolve the remote URL of a specific DVC-tracked version of the dataset.
path = 'data/car_evaluation_processed.csv'
repo = '.'
version = 'v1'
data_url = dvc.api.get_url(path=path, repo=repo, rev=version)
# define metrics for model perfromance evaluation
def model_eval(actual, pred):
    """Return the accuracy score between actual and predicted labels.

    Inp: actual, predicted values
    Out: accuracy score
    """
    return accuracy_score(actual, pred)
# Group every run of this script under one named MLflow experiment.
mlflow.set_experiment("car-evaluation")
# log parameters
def log_data_params(data_url, data_version, data):
    """Log dataset provenance and dimensions to the active MLflow run.

    Inp: data_url, data_version, data (a DataFrame with a .shape attribute)
    Out: none
    """
    n_rows, n_cols = data.shape
    mlflow.log_param("data_url", data_url)
    mlflow.log_param("data_version", data_version)
    mlflow.log_param("num_rows", n_rows)
    mlflow.log_param("num_cols", n_cols)
# encode features
def encode_features(X_train, X_test, columns):
    """Ordinal-encode the given columns, fitting on the training split only.

    Inp: X_train, X_test, columns
    Out: X_train_encoded, X_test_encoded
    """
    encoder = ce.OrdinalEncoder(cols=columns)
    # Fit on train, then apply the same learned mapping to test (no leakage).
    return encoder.fit_transform(X_train), encoder.transform(X_test)
# Execute the training pipeline: load data, log provenance/artifacts to MLflow,
# encode features, train a decision tree, and log its test accuracy.
if __name__ == "__main__":
    warnings.filterwarnings("ignore")

    # read data from the DVC-resolved remote location
    data = pd.read_csv(data_url, sep=",")

    # initialize mlflow; the context manager ends the run on exit,
    # so no explicit mlflow.end_run() is needed afterwards
    with mlflow.start_run():
        # log data parameters
        log_data_params(data_url, version, data)

        # X and y split
        X = data.drop(['class'], axis=1)
        y = data['class']

        # split data into train and test
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # create an 'artifacts' directory; exist_ok avoids the
        # check-then-create race of os.path.exists + os.makedirs
        os.makedirs("./artifacts", exist_ok=True)

        # log artifacts: features
        X_cols = pd.DataFrame(list(X_train.columns))
        X_cols.to_csv('artifacts/features.csv', header=False, index=False)
        mlflow.log_artifact('artifacts/features.csv')

        # log artifacts: targets
        y_cols = pd.DataFrame(list(y_train.unique()))
        y_cols.to_csv('artifacts/targets.csv', header=False, index=False)
        mlflow.log_artifact('artifacts/targets.csv')

        # encode the features
        # NOTE(review): 'meant' looks like a typo of the standard car-evaluation
        # column 'maint' — confirm against the processed CSV's header; an absent
        # column would make the ordinal encoder fail.
        cols = ['buying', 'meant', 'doors', 'persons', 'lug_boot', 'safety']
        X_train, X_test = encode_features(X_train, X_test, cols)

        # set model parameters from the CLI.
        # BUG FIX: sys.argv[0] is the script path, never a user argument —
        # criterion is argv[1] and max_depth is argv[2]. max_depth must be an
        # int: DecisionTreeClassifier rejects a float like 3.0.
        criterion = sys.argv[1] if len(sys.argv) > 1 else "entropy"
        max_depth = int(sys.argv[2]) if len(sys.argv) > 2 else 3

        # training the model
        dtc = DecisionTreeClassifier(criterion=criterion, max_depth=max_depth, random_state=42)
        dtc.fit(X_train, y_train)

        # predicted values and model performance (accuracy)
        y_pred = dtc.predict(X_test)
        accuracy = model_eval(y_test, y_pred)

        # log the metric
        mlflow.log_metric("accuracy", accuracy)