-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathHuntonWeek3.py
225 lines (177 loc) · 7.51 KB
/
HuntonWeek3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
# -------------------------------
# DSC 530: EDA
# Week 3: Programming Assignment
# Due Date: 3/29/20
# Author: Deborah Hunton
#
# Purposes
# - Read in data from National Survey of Family Growth (FemResp and FemPreg data specifically)
# - Clean up the data (review generally to make sure accurately imported and cleaned)
# - Evaluate data:
# a. Are first babies born late? (histogram? - make sure NaN values are ignored; only count live births)
# b. Are first babies heavier or lighter?
# -------------------------------
#import packages per author's code
from __future__ import print_function, division
def CleanFemPreg(df):
    """Recode the pregnancy DataFrame in place (book author's cleaning step).

    df: DataFrame of pregnancy records.  The columns agepreg, birthwgt_lb,
        birthwgt_oz, hpagelb, babysex and nbrnaliv are read and rewritten;
        totalwgt_lb is added and cmintvw is overwritten with NaN.

    Returns None; all changes are made on df itself.
    """
    # mother's age is encoded in centiyears; convert to years
    df['agepreg'] = df['agepreg'] / 100.0

    # birthwgt_lb contains at least one bogus value (51 lbs)
    # replace with NaN
    df.loc[df['birthwgt_lb'] > 20, 'birthwgt_lb'] = np.nan

    # replace 'not ascertained', 'refused', 'don't know' with NaN.
    # The result is assigned back to the column instead of calling
    # replace(inplace=True) on df.<column>: the latter is chained
    # assignment, which only updates a temporary Series (and leaves df
    # unchanged) when pandas Copy-on-Write is active.
    na_vals = [97, 98, 99]
    for column in ('birthwgt_lb', 'birthwgt_oz', 'hpagelb'):
        df[column] = df[column].replace(na_vals, np.nan)

    # replace other not-valid responses for sex, # living
    df['babysex'] = df['babysex'].replace([7, 9], np.nan)
    df['nbrnaliv'] = df['nbrnaliv'].replace([9], np.nan)

    # birthweight is stored in 2 columns (lbs and oz)
    # convert to single column in pounds
    df['totalwgt_lb'] = df['birthwgt_lb'] + df['birthwgt_oz'] / 16.0

    # due to a bug in ReadStataDct, the last variable gets clipped,
    # set to NaN
    df['cmintvw'] = np.nan
def ReadFemPreg(dct_file='2002FemPreg.dct',
                dat_file='2002FemPreg.dat.gz'):
    """Read the pregnancy data and return it as a cleaned DataFrame.

    dct_file: name of the Stata dictionary file describing the columns
    dat_file: name of the gzip-compressed fixed-width data file

    returns: the DataFrame produced by the dictionary's ReadFixedWidth,
             after CleanFemPreg has been applied to it
    """
    # This follows the book author's reading routine.
    stata_dict = thinkstats2.ReadStataDct(dct_file)
    pregnancies = stata_dict.ReadFixedWidth(dat_file, compression='gzip')
    CleanFemPreg(pregnancies)
    return pregnancies
def CleanFemResp(df):
    """Placeholder cleaning hook for the respondent DataFrame.

    df: respondent DataFrame (currently unused)

    No recoding is performed; df is left untouched and None is returned.
    """
    return None
def ReadFemResp(dct_file='2002FemResp.dct',
                dat_file='2002FemResp.dat.gz',
                nrows=None):
    """Read the respondent data and return it as a DataFrame.

    dct_file: name of the Stata dictionary file describing the columns
    dat_file: name of the gzip-compressed fixed-width data file
    nrows: passed through to ReadFixedWidth (None by default)

    returns: the DataFrame produced by the dictionary's ReadFixedWidth,
             after CleanFemResp has been applied to it
    """
    # This follows the book author's reading routine.
    stata_dict = thinkstats2.ReadStataDct(dct_file)
    respondents = stata_dict.ReadFixedWidth(
        dat_file,
        compression='gzip',
        nrows=nrows,
    )
    CleanFemResp(respondents)
    return respondents
def MakePregMap(df):
    """Map each caseid to the index labels of its rows in df.

    df: pregnancy DataFrame with a caseid column

    returns: defaultdict(list) mapping caseid -> list of index labels,
             in the order the rows appear in df
    """
    # This is the book author's function.
    d = defaultdict(list)
    # Series.iteritems() was removed in pandas 2.0; items() yields the
    # same (index label, value) pairs and also exists in older releases.
    for index, caseid in df['caseid'].items():
        d[caseid].append(index)
    return d
def ValidatePregnum(resp, preg):
    """Check resp.pregnum against the number of records in preg.

    resp: respondent DataFrame with caseid and pregnum columns
    preg: pregnancy DataFrame with a caseid column

    returns: True if, for every respondent, pregnum equals the number of
             rows in preg with the same caseid.  On the first mismatch
             the caseid, the record count and pregnum are printed and
             False is returned.
    """
    # This is the book author's function to double-check the two files.
    preg_map = MakePregMap(preg)
    # Walk the two columns in lockstep.  This replaces Series.iteritems()
    # (removed in pandas 2.0) and avoids the per-row label lookup
    # resp.caseid[index], which returns a Series rather than a scalar
    # when the index contains duplicate labels.
    for caseid, pregnum in zip(resp['caseid'], resp['pregnum']):
        indices = preg_map[caseid]
        if len(indices) != pregnum:
            print(caseid, len(indices), pregnum)
            return False
    return True
def exercise_1_1(resp, preg):
    """Print the summaries requested by exercise 1-1.

    resp: respondent DataFrame (uses the caseid and age_r columns)
    preg: pregnancy DataFrame (uses the birthord, prglngth, totalwgt_lb,
          caseid and outcome columns)

    Side effect: adds a totalwgt_kg column to preg.  Returns None.
    """
    # Select the birthord column, print the value counts
    # compare to results in codebook
    vc = preg.birthord.value_counts().sort_index()
    print(vc)

    # Use isnull to count the number of nans
    num_nulls = preg.birthord.isnull().sum()
    print("The number of nulls is: ", num_nulls)

    # Select the prglngth column, print the value counts
    # compare to results in codebook
    vc = preg.prglngth.value_counts().sort_index()
    print(vc)

    # Compute the mean of birthweight in pounds
    mn = preg.totalwgt_lb.mean()
    print("The mean of the weights in lb is: ", mn)

    # Create new column named totalwgt_kg, compute mean
    preg['totalwgt_kg'] = preg.totalwgt_lb / 2.2046223
    mn = preg.totalwgt_kg.mean()
    print("The mean of the weights in kg is: ", mn)

    # Select the age_r column from resp and print value counts
    # How old are oldest and youngest respondents?
    vc = resp.age_r.value_counts().sort_index()
    print(vc)
    # Series.max()/min() skip NaN; the builtin max()/min() compare
    # element by element and give order-dependent results if a NaN is
    # present.
    old = resp['age_r'].max()
    young = resp['age_r'].min()
    print("The oldest respondent was ", old, " years old.")
    print("The youngest respondent was ", young, " years old.")

    # How old is the respondent with caseid 1?
    check_age = resp.loc[resp.caseid == 1, 'age_r'].tolist()
    if check_age:
        print("Respondent #1 was ", check_age[0], " years old.")
    else:
        # Avoid an IndexError when the respondent is not in the data.
        print("Respondent #1 was not found.")

    # What are the pregnancy lengths for the respondent
    # with caseid 2298
    pregnancy_lengths = preg.loc[(preg.caseid == 2298) & (preg.outcome == 1),
                                 'prglngth'].tolist()
    print("The pregnancy lengths for respondent #2298 are ",
          pregnancy_lengths)

    # What was the birthweight of the 1st baby born
    # to the respondent with caseid 5012
    # Select on birthord rather than pregordr: a respondent's first
    # pregnancy need not be her first live birth, and filtering on
    # pregordr == 1 finds nothing when an earlier pregnancy did not end
    # in a live birth.
    # NOTE(review): assumes birthord == 1 marks the first live birth, as
    # in the NSFG codebook - confirm.
    first_baby_wt = preg.loc[(preg.caseid == 5012) & (preg.outcome == 1)
                             & (preg.birthord == 1), 'totalwgt_lb'].tolist()
    if first_baby_wt:
        print("The weight of the first baby for respondent #5012 was ",
              first_baby_wt[0], ".")
    else:
        # Avoid an IndexError when no matching record exists.
        print("No first live birth was found for respondent #5012.")
def exercise_2_4(resp, preg):
    """Compare the birth weights of first babies and other babies.

    Plots histograms of totalwgt_lb and prints Cohen's d for the
    difference in mean weight between first babies and all other live
    births.

    resp: respondent DataFrame (not used here; kept for a uniform
          exercise signature)
    preg: pregnancy DataFrame (uses the outcome, birthord and totalwgt_lb
          columns)

    Returns None.
    """
    # Using the variable totalwgt_lb, investigate whether
    # first babies are lighter or heavier than others.

    # All babies
    # NOTE(review): there is no plt.show() before the next histogram, so
    # this one appears to share a figure with the first-babies plot -
    # confirm that is intended.
    preg['totalwgt_lb'].hist(bins=100)

    # Only live births carry a birth weight of interest; NaN weights are
    # dropped before plotting and before computing statistics.
    live = preg[preg.outcome == 1]

    # First babies
    # Select on birthord rather than pregordr: a first pregnancy is not
    # necessarily the first live birth.
    # NOTE(review): assumes birthord == 1 marks the first live birth, as
    # in the NSFG codebook - confirm.
    first_babies_wt = live.loc[live.birthord == 1,
                               'totalwgt_lb'].dropna().values
    plt.hist(first_babies_wt, bins=100)
    plt.show()

    # All other babies
    other_babies_wt = live.loc[live.birthord != 1,
                               'totalwgt_lb'].dropna().values
    plt.hist(other_babies_wt, bins=100)
    plt.show()

    # Compute Cohen's d to quantify the difference between
    # the groups.  How does it compare to the difference
    # in pregnancy length?
    mean1 = np.mean(first_babies_wt)
    mean2 = np.mean(other_babies_wt)
    var1 = np.var(first_babies_wt)
    var2 = np.var(other_babies_wt)
    n1 = len(first_babies_wt)
    n2 = len(other_babies_wt)

    # Cohen's d is the difference in means divided by the pooled
    # standard deviation, i.e. the square root of the pooled variance.
    # Dividing by the variance itself gives a value that is not
    # unit-free.
    pooled_var = ((n1 * var1) + (n2 * var2)) / (n1 + n2)
    cohen_d = (mean1 - mean2) / np.sqrt(pooled_var)
    print("Cohen's d for first vs other babies is: ", cohen_d)
def main():
    """Load both data files, sanity-check them, then run the exercises.

    Reads the respondent and pregnancy data through ReadFemResp and
    ReadFemPreg, asserts a set of hard-coded row counts and value counts,
    cross-checks pregnum with ValidatePregnum, prints 'All tests passed.'
    and finally calls exercise_1_1 and exercise_2_4.
    """
    resp = ReadFemResp()
    assert len(resp) == 7643
    assert resp.pregnum.value_counts()[1] == 1267

    preg = ReadFemPreg()
    print(preg.shape)
    assert len(preg) == 13593
    assert preg.caseid[13592] == 12571

    # (column, value, number of rows expected to hold that value),
    # checked in the order listed.
    expected_counts = (
        ('pregordr', 1, 5033),
        ('nbrnaliv', 1, 8981),
        ('babysex', 1, 4641),
        ('birthwgt_lb', 7, 3049),
        ('birthwgt_oz', 0, 1037),
        ('prglngth', 39, 4744),
        ('outcome', 1, 9148),
        ('birthord', 1, 4413),
        ('agepreg', 22.75, 100),
        ('totalwgt_lb', 7.5, 302),
    )
    for column, value, count in expected_counts:
        assert preg[column].value_counts()[value] == count

    # The largest finalwgt value is expected to occur six times.
    weights = preg.finalwgt.value_counts()
    key = max(weights.keys())
    assert weights[key] == 6

    assert ValidatePregnum(resp, preg)
    print('All tests passed.')

    exercise_1_1(resp, preg)
    exercise_2_4(resp, preg)
#import packages per author's code
import sys
import numpy as np
import thinkstats2
import matplotlib.pyplot as plt
from collections import defaultdict
#import packages I need for my code
import pandas as pd
# Run the data checks and exercises only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()