forked from GalKepler/hw5
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhw5.py
115 lines (94 loc) · 4.03 KB
/
hw5.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import pathlib
from typing import Tuple, Union

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
class QuestionnaireAnalysis:
    """
    Reads and analyzes data generated by the questionnaire experiment.

    The data file name may be given as a string or as a pathlib.Path object.
    """

    # Columns holding each subject's answers to the five questions.
    _QUESTION_COLUMNS = ["q1", "q2", "q3", "q4", "q5"]

    def __init__(self, data_fname: Union[pathlib.Path, str]):
        """Store the path of the data file after checking that it exists.

        Parameters
        ----------
        data_fname : pathlib.Path or str
            Location of the JSON data file.

        Raises
        ------
        TypeError
            If ``data_fname`` cannot be interpreted as a path (raised by
            ``pathlib.Path``).
        ValueError
            If the file does not exist.
        """
        # pathlib.Path() accepts both str and Path and rejects other types
        # with a TypeError, so no isinstance() branching is needed.
        self.data_fname = pathlib.Path(data_fname)
        if not self.data_fname.exists():
            raise ValueError(f"Data file does not exist: {self.data_fname}")

    def read_data(self) -> None:
        """Reads the json data located in self.data_fname into memory, to
        the attribute self.data.
        """
        self.data = pd.read_json(self.data_fname)

    def show_age_distrib(self) -> Tuple[np.ndarray, np.ndarray]:
        """Calculates and plots the age distribution of the participants.

        Returns
        -------
        hist : np.ndarray
            Number of people in a given bin
        bins : np.ndarray
            Bin edges
        """
        hist, bins = np.histogram(self.data["age"], bins=10, range=(0, 100))
        # Plot with the very same edges that are returned, so that the figure
        # and the returned histogram always describe the same binning.
        self.data.hist(column="age", bins=bins)
        return hist, bins

    def fill_na_with_mean(self) -> Tuple[pd.DataFrame, np.ndarray]:
        """Finds, in the original DataFrame, the subjects that didn't answer
        all questions, and replaces that missing value with the mean of the
        other grades for that student.

        The question columns of self.data are updated in place.

        Returns
        -------
        df : pd.DataFrame
            The corrected DataFrame after insertion of the mean grade
        arr : np.ndarray
            Row indices of the students that their new grades were generated
        """
        grades = self.data[self._QUESTION_COLUMNS]
        # Record who had missing answers before anything is filled in.
        has_missing = grades.isna().any(axis=1)
        # Per-student mean (NaNs are skipped), aligned to the rows by index.
        row_means = grades.mean(axis=1)
        self.data[self._QUESTION_COLUMNS] = grades.apply(
            lambda column: column.fillna(row_means)
        )
        indices = self.data.index[has_missing].to_numpy()
        return self.data, indices

    def score_subjects(self, maximal_nans_per_sub: int = 1) -> pd.DataFrame:
        """Calculates the average score of a subject and adds a new "score" column
        with it.

        If the subject has more than "maximal_nans_per_sub" NaN in his grades, the
        score should be NA. Otherwise, the score is simply the mean of the other grades.
        The datatype of score is UInt8, and the floating point raw numbers should be
        rounded down.

        Parameters
        ----------
        maximal_nans_per_sub : int, optional
            Number of allowed NaNs per subject before giving a NA score.

        Returns
        -------
        pd.DataFrame
            self.data, with the "score" column added to it.
        """
        grades = self.data[self._QUESTION_COLUMNS]
        too_many_nans = grades.isna().sum(axis=1) > maximal_nans_per_sub
        # Floor first, blank out the disqualified subjects, and only then cast:
        # the nullable "UInt8" dtype turns the remaining NaNs into <NA>.
        floored = np.floor(grades.mean(axis=1))
        self.data["score"] = floored.mask(too_many_nans).astype("UInt8")
        return self.data

    def remove_rows_without_mail(self) -> pd.DataFrame:
        """
        Checks self.data for rows with invalid emails, and removes them.

        self.data is modified in place.

        Returns
        -------
        df : pd.DataFrame
            A corrected DataFrame, i.e. the same table but with the erroneous rows removed and
            the (ordinal) index after a reset.
        """
        # Invalid addresses become NaN, which lets dropna() discard their rows.
        self.data["email"] = self.data["email"].apply(self._check_email)
        self.data.dropna(subset=["email"], inplace=True)
        self.data.reset_index(drop=True, inplace=True)
        return self.data

    def _check_email(self, email: object) -> Union[str, float]:
        """Return ``email`` unchanged if it looks valid, otherwise ``np.nan``.

        An address is accepted when it is a string that:
        * contains exactly one "@", which is neither its first nor last character;
        * neither starts nor ends with ".";
        * has a "." somewhere after the "@", but not immediately after it.

        Additional dots (e.g. "first.last@uni.ac.il") are allowed.
        """
        # Missing values (NaN / None) and other non-strings are never valid.
        if not isinstance(email, str):
            return np.nan
        if email.count("@") != 1:
            return np.nan
        local, domain = email.split("@")
        # An empty part means the "@" was the first or the last character.
        if not local or not domain:
            return np.nan
        if email.startswith(".") or email.endswith("."):
            return np.nan
        if domain.startswith(".") or "." not in domain:
            return np.nan
        return email