Skip to content

Commit

Permalink
Merge pull request #6 from Selling-Pandas/plots_patterns
Browse files Browse the repository at this point in the history
Plots_patterns
  • Loading branch information
Vex1cK authored Apr 27, 2024
2 parents d52d362 + a357c2f commit 06d3d56
Show file tree
Hide file tree
Showing 14 changed files with 3,715 additions and 1,117 deletions.
4 changes: 3 additions & 1 deletion .flake8
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,6 @@ max-line-length = 89
import-style-order = google
inline-quotes = double

ignore=T201
ignore=
T201
exclude=test.py
Binary file not shown.
Binary file added fonts/HSESans-Black.otf
Binary file not shown.
Binary file added fonts/HSESans-Bold.otf
Binary file not shown.
Binary file added fonts/HSESans-Italic.otf
Binary file not shown.
Binary file added fonts/HSESans-Regular.otf
Binary file not shown.
Binary file added fonts/HSESans-SemiBold.otf
Binary file not shown.
Binary file added fonts/HSESans-Thin.otf
Binary file not shown.
63 changes: 63 additions & 0 deletions spandas/plots.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import matplotlib.pyplot as plt
from matplotlib import font_manager
from pandas import DataFrame
from spandas.utils import is_float

from typing import Tuple


def subplots(nplots: int, figsize: Tuple[int, int] = ()):
    """Create a two-column grid of axes with the project's plot styling applied.

    Registers every font file found under the ``fonts`` directory (resolved
    relative to the current working directory), updates the global matplotlib
    ``rcParams`` (font family, font size, colour cycle) and then builds the
    figure.

    Args:
        nplots: Number of plots to make room for; the grid gets
            ``ceil(nplots / 2)`` rows and two columns.
        figsize: Figure size passed to matplotlib. When empty, it defaults to
            ``(20, 10 * rows)``.

    Returns:
        The ``(fig, axes)`` pair produced by ``plt.subplots``.
    """
    for font_path in font_manager.findSystemFonts(fontpaths=['fonts']):
        font_manager.fontManager.addfont(font_path)

    style = {
        "font.family": "HSE Sans",
        "font.size": 32,
        # Switch the plot colour to dark blue.
        "axes.prop_cycle": plt.cycler(color=["#01287a"]),
    }
    for option, value in style.items():
        plt.rcParams[option] = value

    # Two plots per row, rounding up when the count is odd.
    rows, remainder = divmod(nplots, 2)
    rows += remainder
    size = figsize or (20, 10 * rows)
    return plt.subplots(nrows=rows, ncols=2, figsize=size)


def print_distributions(
    df: DataFrame, cols: dict, figsize: tuple[int, int] = (30, 30), bins: int = 100
) -> tuple[bool, str]:
    """Draw a histogram for every column in ``cols`` on a two-column grid.

    Values that are NaN, or whose string form is rejected by ``is_float``,
    are plotted as the sentinel ``-1000`` so they show up as a separate bar.
    Plots fill the left column top to bottom, then the right column.

    Args:
        df: Data frame holding the columns to plot.
        cols: Iterable of column names (only iteration and ``len`` are used).
        figsize: Figure size forwarded to ``subplots``.
        bins: Number of histogram bins for every plot.

    Returns:
        ``(True, "")`` on success, or ``(False, message)`` with the text of
        whatever exception interrupted plotting.
    """
    try:
        _, axes = subplots(
            len(cols),
            figsize=figsize,
        )
        # matplotlib squeezes a single-row grid down to a 1-D array; restore
        # the (rows, 2) shape so 2-D indexing also works for 1 or 2 columns.
        axes = axes.reshape(-1, 2)
        n_rows = axes.shape[0]
        for index, col in enumerate(cols):
            # Column-major placement: left column first, then the right one.
            col_idx, row_idx = divmod(index, n_rows)
            ax = axes[row_idx, col_idx]
            col_of_nums = df[col].apply(
                lambda x: (-1000 if (not is_float(str(x)) or x != x) else float(x))
            )  # x != x only when x is NaN
            # Honour the caller's bin count (it used to be silently ignored).
            ax.hist(col_of_nums, bins=bins)
            ax.set_xlabel(f"Значение переменной {col}")
            ax.set_ylabel("Частота")
            ax.set_title(f"График распределения переменной {col}")
        plt.tight_layout()
        plt.show()
    except Exception as ex:
        # Best-effort plotting: report the failure to the caller, don't raise.
        return False, str(ex)
    return True, ""
73 changes: 16 additions & 57 deletions spandas/preprocessing.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
# stdlib

from dataclasses import dataclass

# third party
from art import tprint

import matplotlib.pyplot as plt
from spandas.plots import print_distributions
from spandas.utils import is_float

import pandas as pd

Expand Down Expand Up @@ -68,7 +68,7 @@ def get_list_of_cols(
if df[col].dtype in ["float64", "int64"]:
result.append(col)
continue
str_count = sum(list(map(lambda x: not str(x).isdigit(), df[col].unique())))
str_count = sum(list(map(lambda x: not is_float(str(x)), df[col].unique())))
if str_count <= n:
result.append(col)
return result
Expand Down Expand Up @@ -97,24 +97,23 @@ def mark_outliers(series: pd.Series, method: int = 1):
А если он False, то столбцы со строками вообще не должны выбираться, поэтому сюда
строковые значения не попадут
"""
if series.dtype not in ["float64", "int64"]:
series_without_strings = series[
series.apply(lambda x: str(x).isdigit())
].astype("float64")
series_without_strings = series.copy(deep=True)
if series.dtype not in ['float64', 'int64']:
series_without_strings = series[series.apply(lambda x: is_float(str(x)))] \
.astype("float64")
if method == 1:
q1 = series_without_strings.quantile(0.25)
q3 = series_without_strings.quantile(0.75)
iqr = q3 - q1
lower_fence = q1 - 1.5 * iqr
upper_fence = q3 + 1.5 * iqr
return ~series.apply(
lambda x: (
lower_fence <= float(x) <= upper_fence if str(x).isdigit() else True
)
)
return ~series.apply(lambda x: (lower_fence <= float(x) <= upper_fence
if is_float(str(x)) else True))
if method == 2:
# here will be some method
pass
q05 = series_without_strings.quantile(0.05)
q95 = series_without_strings.quantile(0.95)
return ~series.apply(lambda x: (q05 <= float(x) <= q95
if is_float(str(x)) else True))


def remove_outliers_from_series(series: pd.Series, method: int = 1):
Expand Down Expand Up @@ -148,46 +147,6 @@ def remove_outliers(df: pd.DataFrame, columns: list[str] = [], method: int = 1):
return df[~sum(mark_outliers(df[co], method=method) for co in columns).astype(bool)]


def print_distr(
    df: pd.DataFrame, cols: dict, figsize: tuple[int, int] = (30, 30), bins: int = 100
) -> tuple[bool, str]:
    """Draw a histogram for every column in ``cols`` on a two-column grid.

    Values that are NaN, or whose string form is not all digits, are plotted
    as the sentinel ``-1000``. Plots fill the left column top to bottom, then
    the right column.

    Note from the original author: this plotting code is a candidate for its
    own submodule; for now it is called automatically at the end of
    ``data_preprocessing`` in the was/became logging part, and its parameters
    are expected to change.

    Args:
        df: Data frame holding the columns to plot.
        cols: Iterable of column names (only iteration and ``len`` are used).
        figsize: Figure size forwarded to ``plt.subplots``.
        bins: Number of histogram bins for every plot.

    Returns:
        ``(True, "")`` on success, or ``(False, message)`` with the text of
        whatever exception interrupted plotting.
    """
    try:
        max_i = (len(cols) // 2) + (1 if len(cols) % 2 > 0 else 0)
        # squeeze=False keeps ``axes`` 2-D even for a single row, so the
        # axes[i, j] indexing below also works for one or two columns.
        _, axes = plt.subplots(max_i, 2, figsize=figsize, squeeze=False)
        i, j = 0, 0
        for col in cols:
            col_of_nums = df[col].apply(
                lambda x: (-1000 if (not str(x).isdigit() or x != x) else float(x))
            )  # x != x only when x is NaN
            axes[i, j].hist(col_of_nums, bins=bins)
            axes[i, j].set_xlabel(f"Значение переменной {col}")
            axes[i, j].set_ylabel("Частота")
            axes[i, j].set_title(f"График распределения переменной {col}")
            i += 1
            if i == max_i:
                # Left column is full; continue at the top of the right one.
                j += 1
                i = 0
        plt.tight_layout()
        plt.show()
    except Exception as ex:
        # Best-effort plotting: report the failure to the caller, don't raise.
        return False, str(ex)
    return True, ""


def data_preprocessing(
df: pd.DataFrame,
n: int = 2,
Expand Down Expand Up @@ -264,10 +223,10 @@ def data_preprocessing(
clear_df = clear_df[~marks]
if logging.was_became:
tprint("WAS:")
print_distr(df, cols, (10, 13))
print_distributions(df, cols)
tprint("__________")
tprint("BECAME:")
print_distr(clear_df, cols, (10, 13))
print_distributions(clear_df, cols)
return clear_df, deleted


Expand Down
5 changes: 5 additions & 0 deletions spandas/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
def is_float(string):
    """Return True if ``string`` converts to a finite number via ``float()``.

    Callers pass the result straight to ``float(x)``, so the check must agree
    with what ``float()`` accepts: signed values and exponents are numeric,
    while inputs such as ``"1.2.3"`` or ``"1,5"`` are not. NaN and infinities
    are rejected so that missing values are never treated as numbers.

    Args:
        string: Text to test (typically ``str(value)``).

    Returns:
        True when the text is a finite float literal, False otherwise.
    """
    try:
        value = float(string)
    except (TypeError, ValueError, OverflowError):
        return False
    # NaN is the only value that differs from itself; infinities are excluded
    # by the magnitude check.
    return value == value and abs(value) != float("inf")
Loading

0 comments on commit 06d3d56

Please sign in to comment.