# hw5.py

import pathlib
from typing import Tuple, Union

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


class QuestionnaireAnalysis:
    """
    Reads and analyzes data generated by the questionnaire experiment.
    Should be able to accept strings and pathlib.Path objects.
    """

    # Columns of ``self.data`` that hold the per-question grades.
    # Kept as a list because pandas treats a tuple key as a single label.
    _QUESTION_COLUMNS = ['q1', 'q2', 'q3', 'q4', 'q5']

    def __init__(self, data_fname: Union[pathlib.Path, str]):
        """Store the path of the data file after checking that it exists.

        Parameters
        ----------
        data_fname : pathlib.Path or str
            Location of the questionnaire data file.

        Raises
        ------
        TypeError
            If ``data_fname`` cannot be interpreted as a path.
        ValueError
            If the path does not exist.
        """
        if not isinstance(data_fname, pathlib.Path):
            # pathlib.Path() itself rejects non path-like input with a
            # TypeError, which is clearer than failing later on `.exists()`.
            data_fname = pathlib.Path(data_fname)
        self.data_fname = data_fname

        if not self.data_fname.exists():
            raise ValueError(f"Data file does not exist: {self.data_fname}")

    def read_data(self) -> None:
        """Reads the json data located in self.data_fname into memory, to
        the attribute self.data.
        """
        self.data = pd.read_json(self.data_fname)

    def show_age_distrib(self) -> Tuple[np.ndarray, np.ndarray]:
        """Calculates and plots the age distribution of the participants.

        The histogram uses numpy's default of 10 equal-width bins over the
        range 0-100, and the plot is drawn with the same bin edges that are
        returned.

        Returns
        -------
        hist : np.ndarray
            Number of people in a given bin
        bins : np.ndarray
            Bin edges
        """
        hist, bins = np.histogram(self.data['age'], range=(0, 100))
        # Reuse the computed edges so the figure shows exactly the bins that
        # are returned to the caller.
        self.data.hist(column='age', bins=bins)
        return hist, bins

    def fill_na_with_mean(self) -> Tuple[pd.DataFrame, np.ndarray]:
        """Finds, in the original DataFrame, the subjects that didn't answer
        all questions, and replaces that missing value with the mean of the
        other grades for that student.

        The replacement is written into ``self.data`` itself. A subject with
        no answered question at all keeps the missing values, since there is
        no grade to average.

        Returns
        -------
        df : pd.DataFrame
            The corrected DataFrame after insertion of the mean grade
        arr : np.ndarray
            Row indices of the students that their new grades were generated
        """
        grades = self.data[self._QUESTION_COLUMNS]
        # Record who had gaps before they are filled in.
        has_missing = grades.isna().any(axis=1)

        # Mean over each row, i.e. per student (NaNs are skipped by pandas).
        student_means = grades.mean(axis=1)
        # Series.fillna aligns on the row index, so every gap receives the
        # mean of the student it belongs to.
        self.data[self._QUESTION_COLUMNS] = grades.apply(
            lambda column: column.fillna(student_means)
        )

        filled_rows = self.data.index[has_missing].to_numpy()
        return self.data, filled_rows

    def score_subjects(self, maximal_nans_per_sub: int = 1) -> pd.DataFrame:
        """Calculates the average score of a subject and adds a new "score" column
        with it.

        If the subject has more than "maximal_nans_per_sub" NaN in his grades, the
        score should be NA. Otherwise, the score is simply the mean of the other grades.
        The datatype of score is UInt8, and the floating point raw numbers should be
        rounded down.

        Parameters
        ----------
        maximal_nans_per_sub : int, optional
            Number of allowed NaNs per subject before giving a NA score.

        Returns
        -------
        pd.DataFrame
            ``self.data`` with the added "score" column.
        """
        grades = self.data[self._QUESTION_COLUMNS]
        too_many_nans = grades.isna().sum(axis=1) > maximal_nans_per_sub

        # Blank out the mean of subjects that skipped too many questions.
        raw_scores = grades.mean(axis=1).mask(too_many_nans)
        # Nullable UInt8 keeps the blanked scores as <NA> after flooring.
        self.data['score'] = np.floor(raw_scores).astype("UInt8")
        return self.data

    def remove_rows_without_mail(self) -> pd.DataFrame:
        """
        Checks self.data for rows with invalid emails, and removes them.

        Returns
        -------
        df : pd.DataFrame
            A corrected DataFrame, i.e. the same table but with the erroneous
            rows removed and the (ordinal) index after a reset.
        """
        # Invalid addresses come back as NaN, which dropna then removes.
        self.data['email'] = self.data['email'].apply(self._check_email)
        self.data.dropna(subset=['email'], inplace=True)
        self.data.reset_index(drop=True, inplace=True)
        return self.data

    def _check_email(self, email: str):
        """Return ``email`` if it looks valid, otherwise ``np.nan``.

        An address is accepted when it is a string that

        * contains exactly one "@",
        * neither starts nor ends with "@" or ".",
        * has a "." in the part after the "@", but not directly after it.

        Any number of dots is allowed, so "first.last@mail.co.uk" is valid.
        Non-string values (e.g. a missing email stored as NaN) are invalid.
        """
        if not isinstance(email, str):
            return np.nan
        # Also rejects the empty string, so indexing below is safe.
        if email.count("@") != 1:
            return np.nan
        if email[0] in ".@" or email[-1] in ".@":
            return np.nan

        domain = email.split("@")[1]
        if "." not in domain or domain.startswith("."):
            return np.nan
        return email