Skip to content

Commit

Permalink
script for data preprocessing
Browse files Browse the repository at this point in the history
  • Loading branch information
OL-YAD committed Sep 20, 2024
1 parent 7b3c726 commit 6bc5cd0
Showing 1 changed file with 39 additions and 0 deletions.
39 changes: 39 additions & 0 deletions scripts/preprocessing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns



# calculate percentage of missing values
def calculate_missing_percentage(dataframe):
    """Print and return the overall percentage of missing cells in a DataFrame.

    The percentage is taken over every cell (rows x columns), not per column.

    Args:
        dataframe: The pandas DataFrame to inspect.

    Returns:
        The unrounded percentage of cells that are NaN/None, as a float.
        A DataFrame with no cells (no rows or no columns) yields 0.0.
    """
    # Total number of cells (rows * columns).
    total_elements = dataframe.size

    if total_elements == 0:
        # Nothing to inspect: avoid the 0 / 0 division, which would otherwise
        # report "nan%" and emit a NumPy RuntimeWarning.
        percentage_missing = 0.0
    else:
        # Per-column missing counts, summed into one grand total.
        total_missing = dataframe.isna().sum().sum()
        percentage_missing = (total_missing / total_elements) * 100

    # Report the result rounded to two decimal places.
    print(f"The dataset has {round(percentage_missing, 2)}% missing values.")
    return percentage_missing


def check_missing_values(df):
    """Summarise missing values per column.

    Args:
        df: The pandas DataFrame to inspect.

    Returns:
        A DataFrame indexed by the column names of ``df`` with three columns:
        'Missing Values' (count of nulls), '% of Total Values' (nulls as a
        percentage of the row count) and 'Data type' (the column dtype).
        Rows are sorted by percentage, highest first, and numeric values are
        rounded to two decimal places. For a DataFrame with zero rows the
        percentage is reported as 0.0 rather than NaN.
    """
    row_count = len(df)
    # Count the nulls once and reuse the result for both output columns.
    missing_counts = df.isnull().sum()

    if row_count == 0:
        # With no rows every count is zero; skip the 0 / 0 division that
        # would fill the percentage column with NaN.
        missing_percentages = missing_counts.astype(float)
    else:
        missing_percentages = 100 * missing_counts / row_count

    missing_table = pd.concat(
        [missing_counts, missing_percentages, df.dtypes],
        axis=1,
        keys=['Missing Values', '% of Total Values', 'Data type'],
    )
    return missing_table.sort_values('% of Total Values', ascending=False).round(2)

def outlier_box_plots(df):
    """Show one horizontal box plot per numeric column of ``df``.

    Each plot is drawn on its own 10x5 figure and displayed immediately with
    ``plt.show()``. Non-numeric columns (strings, objects, datetimes, ...)
    are skipped: a box plot needs a numeric variable, and passing such a
    column to ``sns.boxplot`` would raise after an empty figure had already
    been created.

    Args:
        df: The pandas DataFrame whose numeric columns are plotted.

    Returns:
        None. The function only produces plots.
    """
    # Restrict the loop to columns a box plot can actually be drawn for.
    numeric_columns = df.select_dtypes(include="number").columns

    for column in numeric_columns:
        plt.figure(figsize=(10, 5))
        sns.boxplot(x=df[column])
        plt.title(f'Box plot of {column}')
        plt.show()

0 comments on commit 6bc5cd0

Please sign in to comment.