import glob
import re

import numpy as np
### Collect data about features ###
OUTCOMES = ["gpa","grit","materialHardship","eviction","layoff","jobTraining"]

def parse_codebook_data(fname):
    """Parse one codebook file into a {feature_name: description} dict.

    Entries are read from the line that follows each '-----' separator:
    the first token is the variable name, the rest is its description.
    """
    fNames = dict()
    with open(fname, 'r') as fl:
        expect_entry = False
        for line in fl:
            if line.startswith('-----'):
                expect_entry = True
            elif expect_entry:
                # Collapse runs of whitespace, then split off the variable name.
                temp = ' '.join(line.split()).split(' ')
                fNames[temp[0]] = ' '.join(temp[1:])
                expect_entry = False
    return fNames
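
# Illustrative codebook entry in the layout this parser expects (inferred from
# the parsing logic above; 'cm1age' and its description are hypothetical):
#
#   --------------------------------------------
#   cm1age    Mother's age at baseline interview
#
# parse_codebook_data() would then record
#   fNames['cm1age'] = "Mother's age at baseline interview"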

def get_feature_name_definitions():
    """Collect feature-name definitions from every codebook file under data/codebooks/."""
    featureNames = dict()
    for fname in glob.glob('data/codebooks/ff*.txt'):
        featureNames.update(parse_codebook_data(fname))
    return featureNames

def get_feature_list(fname='data/raw_data/background.csv'):
    """Return all feature column names from the background file, minus the ID column."""
    with open(fname, 'r') as fl:
        header = set(fl.readline().strip().replace('"', '').split(','))
    return list(header - {'challengeID'})

# Return the sorted list of features that have a numeric value for every user.
def get_core_feature_list():
    coreFeatures = set(get_feature_list())
    for uid, feat in iterate_user_background_features():
        coreFeatures = coreFeatures & set(feat)
    return sorted(coreFeatures)

# Get all core features whose codebook description contains the given keyword,
# for instance 'education' or 'mother'.
def get_feature_by_keyword(keyword):
    flist = list()
    fDefinition = get_feature_name_definitions()
    for f in get_core_feature_list():
        if keyword in fDefinition.get(f, ''):
            flist.append(f)
    return flist
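
# Example usage with one of the keywords mentioned above; the matched feature
# list depends entirely on the local codebook files:
#   education_features = get_feature_by_keyword('education')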

def parse_feature_name(fstr):
    """Split a feature name into its prefix, wave number, and question code.

    Raises IndexError for names that contain no wave digits; callers catch
    this to skip non-conforming features.
    """
    temp = re.split(r'(\d+)', fstr)
    return {'prefix': temp[0].split('_')[0], 'wave': int(temp[1]),
            'question': ''.join(temp[2:])}
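
# Illustrative parse ('cm1relf' is a hypothetical feature name in the usual
# prefix-wave-question shape):
#   parse_feature_name('cm1relf')
#   -> {'prefix': 'cm', 'wave': 1, 'question': 'relf'}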

# Select the features whose prefix and wave match the given filters; a filter
# left as None matches everything.
def select_feature_by_type(features=None, prefix=None, wave=None):
    if features is None:
        features = get_feature_list()
    selectedFeatures = list()
    for f in features:
        try:
            ftype = parse_feature_name(f)
        except (IndexError, ValueError):
            # Skip names that do not follow the prefix-wave-question format.
            continue
        toAddP = prefix is None or ftype['prefix'] in prefix
        toAddW = wave is None or ftype['wave'] in wave
        if toAddP and toAddW:
            selectedFeatures.append(f)
    return selectedFeatures
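
# Example usage (the 'm' prefix for mother questionnaires is an assumption;
# inspect the codebooks for the prefixes actually present in the data):
#   wave1_mother_features = select_feature_by_type(prefix=['m'], wave=[1])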

def iterate_user_missing_features(fname='data/raw_data/background.csv'):
    """Yield (user_id, counts) where counts maps a prefix+wave group to the
    number of that user's features with no numeric value."""
    with open(fname, 'r') as fl:
        header = fl.readline().strip().replace('"', '').split(',')
        for line in fl:
            temp = line.strip().split(',')
            missingFeatures = list()
            for i in range(1, len(header)):
                try:
                    float(temp[i])
                except (ValueError, IndexError):
                    missingFeatures.append(header[i])
            mfeat = dict()
            for f in missingFeatures:
                try:
                    ftype = parse_feature_name(f)
                except (IndexError, ValueError):
                    continue
                mf = '{}{}'.format(ftype['prefix'], ftype['wave'])
                mfeat[mf] = mfeat.get(mf, 0) + 1
            yield int(temp[0]), mfeat
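
# Each yielded pair is (challengeID, per-group missing counts), e.g.
# (values purely illustrative):
#   (1, {'cm1': 12, 'f2': 30})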

### LOAD DATASETS ###

def iterate_user_background_features(fname='data/raw_data/background.csv'):
    """Yield (user_id, features) for each row of the background file, keeping
    only the columns that parse as numbers."""
    with open(fname, 'r') as fl:
        header = fl.readline().strip().replace('"', '').split(',')
        for line in fl:
            temp = line.strip().split(',')
            ufeat = dict()
            for i in range(1, len(header)):
                try:
                    ufeat[header[i]] = float(temp[i])
                except (ValueError, IndexError):
                    # Non-numeric entries are treated as missing.
                    pass
            yield int(temp[0]), ufeat

def iterate_training_data(fname='data/raw_data/train.csv'):
    """Yield (user_id, labels) for each training row; labels maps an outcome
    name to its numeric value, skipping missing entries."""
    with open(fname, 'r') as fl:
        header = fl.readline().strip().replace('"', '').split(',')
        for line in fl:
            temp = line.strip().split(',')
            ulabels = dict()
            for i in range(1, len(header)):
                try:
                    ulabels[header[i]] = float(temp[i])
                except (ValueError, IndexError):
                    pass
            yield int(temp[0]), ulabels

def combine_user_feature_iterators(externalFeatures):
    """Yield background features merged with any external per-user feature
    dicts ({uid: {name: value}}) that cover the same user."""
    for uid, feat in iterate_user_background_features():
        for ext in externalFeatures:
            if uid in ext:
                feat.update(ext[uid])
        yield uid, feat
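
# Example usage (the external feature name is hypothetical):
#   extra = {1: {'derivedScore': 0.7}, 2: {'derivedScore': 0.3}}
#   for uid, feat in combine_user_feature_iterators([extra]):
#       ...  # feat now includes 'derivedScore' where available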

# Creates the dataset for ML models.
def create_data_matrix(featureList=None, outcome='gpa', userFeatureIterator=None):
    """Build train/test matrices for one outcome.

    Users with a label for the outcome go into (trX, trY); everyone else goes
    into teX. Missing feature values default to 0. uidLbls records, per user,
    whether they ended up in the training set.
    """
    if featureList is None:
        featureList = sorted(get_feature_list())
    if userFeatureIterator is None:
        userFeatureIterator = iterate_user_background_features()
    trX, teX, trY = dict(), dict(), dict()
    for uid, lbls in iterate_training_data():
        if outcome in lbls:
            trY[uid] = lbls[outcome]
    uidLbls = dict()
    for uid, feat in userFeatureIterator:
        if uid in trY:
            trX[uid] = [feat.get(f, 0) for f in featureList]
            uidLbls[uid] = True
        else:
            teX[uid] = [feat.get(f, 0) for f in featureList]
            uidLbls[uid] = False
    trX = np.array([trX[u] for u in sorted(trX)])
    teX = np.array([teX[u] for u in sorted(teX)])
    trY = np.array([trY[u] for u in sorted(trY)])
    print('TrX: {}, TrY: {}, TeX: {}'.format(trX.shape, trY.shape, teX.shape))
    return trX, trY, teX, uidLbls
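
# Minimal usage sketch: build matrices for the 'gpa' outcome. Assumes the CSVs
# exist at the default paths referenced above (data/raw_data/*.csv).
if __name__ == '__main__':
    trX, trY, teX, uidLbls = create_data_matrix(outcome='gpa')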