-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_preparation_CMAPSS.py
110 lines (90 loc) · 5.1 KB
/
data_preparation_CMAPSS.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import pandas as pd
import numpy as np
from sklearn import preprocessing
from pickle import dump
# Setting a seed for reproducibility
np.random.seed(1234)
# Data ingestion - reading the datasets from Azure blob
# wget.download('http://azuremlsamples.azureml.net/templatedata/PM_train.txt', out='data/')
# wget.download('http://azuremlsamples.azureml.net/templatedata/PM_test.txt', out='data/')
# wget.download('http://azuremlsamples.azureml.net/templatedata/PM_truth.txt', out='data/')
cmapss_datasets = ['FD001.txt', 'FD002.txt', 'FD003.txt', 'FD004.txt']

# Column layout shared by the raw train and test files.
sensors = [f's{j}' for j in range(1, 22)]
_COLUMNS = ['id', 'cycle', 'setting1', 'setting2', 'setting3'] + sensors

# Label windows, expressed in cycles of remaining useful life (RUL).
# "w1" is used for the binary classification problem:
#   is this engine going to fail within w1 cycles? ("label1")
# "w0" is used for multi-class classification ("label2"): will the engine
#   fail within w0 cycles (2), within (w0, w1] cycles (1), or not within
#   w1 cycles (0)?
w1 = 30
w0 = 15


def _read_cmapss(path):
    """Read one raw train/test file and name its columns."""
    df = pd.read_csv(path, sep=" ", header=None)
    # remove the last two columns that contain just NaN
    df.drop(df.columns[[26, 27]], axis=1, inplace=True)
    df.columns = _COLUMNS
    return df


def _add_labels(df):
    """Add 'label1' and 'label2' in place from the 'RUL' column.

    'RUL' must still be expressed in cycles when this is called: the
    thresholds w0/w1 are cycle counts, so labelling a min-max scaled RUL
    (values in [0, 1]) would mark every row as "about to fail".
    """
    df['label1'] = np.where(df['RUL'] <= w1, 1, 0)
    df['label2'] = df['label1']
    df.loc[df['RUL'] <= w0, 'label2'] = 2


def _replace_columns(df, cols, values):
    """Return a copy of df with `cols` replaced by `values`, column order kept."""
    scaled = pd.DataFrame(values, columns=cols, index=df.index)
    untouched = df[df.columns.difference(cols)]
    return untouched.join(scaled).reindex(columns=df.columns)


for cmapss in cmapss_datasets:
    subset = cmapss[:-4]  # e.g. 'FD001'

    # read training, test and ground-truth data
    train_df = _read_cmapss('data/CMAPSSData/train_' + cmapss)
    test_df = _read_cmapss('data/CMAPSSData/test_' + cmapss)
    truth_df = pd.read_csv('data/CMAPSSData/RUL_' + cmapss, sep=" ", header=None)
    # the second column of the ground-truth file is dropped; only the
    # first one is used below
    truth_df.drop(truth_df.columns[[1]], axis=1, inplace=True)

    # Data Labeling - generate column RUL (cycles left until the last
    # recorded cycle of each engine)
    rul = pd.DataFrame(train_df.groupby('id')['cycle'].max()).reset_index()
    rul.columns = ['id', 'max']
    train_df = train_df.merge(rul, on=['id'], how='left')
    train_df['RUL'] = train_df['max'] - train_df['cycle']
    train_df.drop('max', axis=1, inplace=True)

    # Generate label columns while RUL is still in cycles, THEN scale the
    # regression target.
    _add_labels(train_df)
    target_scaler = preprocessing.MinMaxScaler()
    train_df['RUL'] = target_scaler.fit_transform(
        train_df['RUL'].values.reshape(-1, 1)).ravel()

    # MinMax normalization of the settings and sensor columns; the scaler
    # is fitted on the training data only and re-used for the test data.
    cols_normalize = train_df.columns.difference(
        ['id', 'cycle', 'RUL', 'label1', 'label2'])
    min_max_scaler = preprocessing.MinMaxScaler()
    train_df = _replace_columns(
        train_df, cols_normalize,
        min_max_scaler.fit_transform(train_df[cols_normalize]))
    test_df = _replace_columns(
        test_df, cols_normalize,
        min_max_scaler.transform(test_df[cols_normalize]))
    test_df = test_df.reset_index(drop=True)

    # generate column max for test data: last observed cycle plus the
    # ground-truth value for that engine
    rul = pd.DataFrame(test_df.groupby('id')['cycle'].max()).reset_index()
    rul.columns = ['id', 'max']
    truth_df.columns = ['more']
    # row i of the ground-truth file is matched to engine id i + 1
    truth_df['id'] = truth_df.index + 1
    truth_df['max'] = rul['max'] + truth_df['more']
    truth_df.drop('more', axis=1, inplace=True)

    # generate RUL for test data
    test_df = test_df.merge(truth_df, on=['id'], how='left')
    test_df['RUL'] = test_df['max'] - test_df['cycle']
    test_df.drop('max', axis=1, inplace=True)

    # Labels from raw RUL first, then scale the target with the scaler
    # fitted on the training set.
    _add_labels(test_df)
    test_df['RUL'] = target_scaler.transform(
        test_df['RUL'].values.reshape(-1, 1)).ravel()

    # Split test into validation and test dataset.
    # NOTE(review): the id bounds are hard-coded, so any engine with
    # id > 100 ends up in neither split - confirm this is intended for
    # subsets that contain more than 100 test engines.
    val_df = test_df[(test_df.id >= 1) & (test_df.id < 70)]
    test_df = test_df[(test_df.id >= 70) & (test_df.id <= 100)]

    # Remove sensor columns not carrying any information:
    # i.e., sensors 1, 5, 10, 16, 18 and 19 (it seems it only applies to dataset FD001)
    # sensors_drop = ['s1', 's5', 's6', 's10', 's16', 's18', 's19']
    # train_df.drop(sensors_drop, axis=1, inplace=True)
    # test_df.drop(sensors_drop, axis=1, inplace=True)

    # Save scaler object for later use; the context manager closes the file
    with open('data/CMAPSSData/target_scaler_' + subset + '.pkl', 'wb') as scaler_file:
        dump(target_scaler, scaler_file)

    train_df.to_csv('data/CMAPSSData/train_' + subset + '.csv', sep=' ', float_format='%.3f')
    val_df.to_csv('data/CMAPSSData/val_' + subset + '.csv', sep=' ', float_format='%.3f')
    test_df.to_csv('data/CMAPSSData/test_' + subset + '.csv', sep=' ', float_format='%.3f')