Skip to content

Commit

Permalink
Merge pull request #204 from MindSetLib/dev
Browse files Browse the repository at this point in the history
HomogeneityReport refactoring
  • Loading branch information
alexmindset authored May 22, 2024
2 parents 7fa092c + 220c8f9 commit ca4f124
Show file tree
Hide file tree
Showing 6 changed files with 153 additions and 106 deletions.
62 changes: 34 additions & 28 deletions insolver/feature_monitoring/homogeneity_report.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
import os
import inspect
import numpy as np
import plotly as py
from os.path import dirname
from typing import List, Sequence, Dict, Union
from pandas import DataFrame
from plotly import express as px
from plotly.offline import plot
from plotly.graph_objects import Figure, Histogram
from plotly.figure_factory import create_distplot
from jinja2 import Environment, FileSystemLoader

Expand All @@ -14,7 +14,7 @@

def chart_cont(
x1: np.ndarray, x2: np.ndarray, name1: str, name2: str, limits: Sequence, bins: int = 15, offline: bool = True
) -> py.graph_objs.Figure:
) -> Figure:
"""
This function draws histograms of given samples using joint grid.
It needs the limits of the area of interest and the number of bins.
Expand Down Expand Up @@ -54,24 +54,30 @@ def chart_cont(
# draw hists
hist_data = [x1_group, x2_group]
fig = create_distplot(
hist_data, group_labels, bin_size=bin_size, histnorm='probability', show_curve=False, show_rug=False
hist_data,
group_labels,
bin_size=bin_size,
histnorm='probability',
colors=['blue', 'red'],
show_curve=False,
show_rug=False,
)

# add details
fig.update_layout(
autosize=False,
width=830,
height=650,
xaxis_range=None,
legend=dict(x=0.8, y=0.95, traceorder='normal', font=dict(color='black', size=16)),
autosize=True,
width=None,
height=None,
margin=dict(l=5, r=5, t=5, b=5),
legend=dict(orientation="h", traceorder='normal'),
)
if offline:
return py.offline.plot(fig, include_plotlyjs=False, output_type='div')
return plot(fig, include_plotlyjs=False, output_type='div')
else:
return fig


def chart_discr(x1: np.ndarray, x2: np.ndarray, name1: str, name2: str, offline: bool = True) -> py.graph_objs.Figure:
def chart_discr(x1: np.ndarray, x2: np.ndarray, name1: str, name2: str, offline: bool = True) -> Figure:
"""
This function draws histograms of given samples using joint grid.
Unlike chart_cont, it needs neither limits of the area of interest nor a number of bins.
Expand All @@ -89,22 +95,22 @@ def chart_discr(x1: np.ndarray, x2: np.ndarray, name1: str, name2: str, offline:
"""

# draw discrete hists
fig1 = px.histogram(x1, histnorm='probability', barmode='overlay', color_discrete_sequence=['green'])
fig1.for_each_trace(lambda t: t.update(name=name1))
fig2 = px.histogram(x2, histnorm='probability', barmode='overlay', color_discrete_sequence=['red'])
fig2.for_each_trace(lambda t: t.update(name=name2))
fig = py.graph_objects.Figure(data=fig1.data + fig2.data)
fig = Figure()
fig.add_trace(Histogram(x=x1, name=name1, histnorm='probability', marker=dict(color='blue'), opacity=0.7))
fig.add_trace(Histogram(x=x2, name=name2, histnorm='probability', marker=dict(color='red'), opacity=0.7))
fig.update_xaxes(type='category')

# add details
fig.update_layout(
autosize=False,
width=830,
height=650,
legend=dict(x=0.8, y=0.95, traceorder='normal', font=dict(color='black', size=16)),
autosize=True,
width=None,
height=None,
margin=dict(l=5, r=5, t=5, b=5),
barmode='overlay',
legend=dict(orientation="h", traceorder='normal'),
)

if offline:
return py.offline.plot(fig, include_plotlyjs=False, output_type='div')
return plot(fig, include_plotlyjs=False, output_type='div')
else:
return fig

Expand Down Expand Up @@ -267,13 +273,13 @@ def build_report(
psi_bins = 20 if ('psi_bins' not in properties) else properties['psi_bins']

# manually fill nans
x1, x2, _ = fillna_cont(x1, x2, inplace=True)
x1, x2, _ = fillna_cont(x1, x2)

# run tests
homogen_tester: Union['ContinuousHomogeneityTests', 'DiscreteHomogeneityTests'] = (
ContinuousHomogeneityTests(pval_thresh, samp_size, bootstrap_num, psi_bins)
)
test_results = homogen_tester.run_all(x1, x2, inplace=True)
test_results = homogen_tester.run_all(x1, x2)

# optional drawing of charts
if draw_charts:
Expand All @@ -287,11 +293,11 @@ def build_report(

elif feat_type == 'discrete':
# manually fill nans
x1, x2, nan_value = fillna_discr(x1, x2, inplace=True)
x1, x2, nan_value = fillna_discr(x1, x2)

# run tests
homogen_tester = DiscreteHomogeneityTests(pval_thresh, samp_size, bootstrap_num)
test_results = homogen_tester.run_all(x1, x2, inplace=True)
test_results = homogen_tester.run_all(x1, x2)

# optional drawing charts
if draw_charts:
Expand Down Expand Up @@ -353,7 +359,7 @@ def render_report(report_data: list, report_path: str = 'homogeneity_report.html
curr_folder = dirname(inspect.getfile(HomogeneityReport))
template_path = curr_folder + '/' + 'report_template.html'
if not os.path.exists(template_path):
raise OSError("Can not find template file. It must be in 'feature_monitoring' package.")
raise OSError("Can not find template file. It must be in 'feature_monitoring' module.")

# error situations
for feat_report in report_data:
Expand All @@ -372,5 +378,5 @@ def render_report(report_data: list, report_path: str = 'homogeneity_report.html
template = env.get_template("report_template.html")
output = template.render(sets=report_data)

with open(report_path, 'w') as f:
with open(report_path, 'w', encoding="utf-8") as f:
f.write(output)
69 changes: 23 additions & 46 deletions insolver/feature_monitoring/homogeneity_tests.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import numpy as np
import pandas as pd
from pandas import isna
from scipy import stats as sps
from sklearn.preprocessing import LabelEncoder
from typing import Callable, List, Any
Expand Down Expand Up @@ -54,16 +54,14 @@ def bootstrap(x1: np.ndarray, x2: np.ndarray, bootstrap_num: int, samp_size: int
return pvalue


def fillna_discr(x1_ref: np.ndarray, x2_ref: np.ndarray, inplace: bool = False) -> Any:
def fillna_discr(x1_ref: np.ndarray, x2_ref: np.ndarray) -> Any:
"""
This function fills missing values in x1 and x2 safely for homogeneity tests.
It guarantees that missing values will be filled with a unique constant.
Parameters:
x1_ref (np.array): sample from base period.
x2_ref (np.array): sample from current period.
inplace (bool): whether to modify original x1, x2
or to make copy and return it.
Returns:
x1 (np.array): sample from base period without missing values.
Expand All @@ -72,29 +70,23 @@ def fillna_discr(x1_ref: np.ndarray, x2_ref: np.ndarray, inplace: bool = False)
"""

# copy inputs to avoid side effects
if inplace:
x1, x2 = x1_ref, x2_ref
else:
x1, x2 = x1_ref.copy(), x2_ref.copy()
x1, x2 = x1_ref.copy(), x2_ref.copy()

# if we have numeric data we define nan_value as (minimum - 1)
if x1.dtype == 'int':
nan_value = min(np.min(x1), np.min(x2)) - 1
return x1, x2, nan_value
if x1.dtype == 'float':
nan_value = min(np.min(x1[~pd.isna(x1)]), np.min(x2[~pd.isna(x2)])) - 1
x1[pd.isna(x1)] = nan_value
x2[pd.isna(x2)] = nan_value
nan_value = min(np.min(x1[~isna(x1)]), np.min(x2[~isna(x2)])) - 1
x1[isna(x1)] = nan_value
x2[isna(x2)] = nan_value
return x1, x2, nan_value

# if we have object data we fill nan with 'nan' str
if x1.dtype == object:
x1[pd.isna(x1)] = 'nan'
x2[pd.isna(x2)] = 'nan'
x1[isna(x1)] = 'nan'
x2[isna(x2)] = 'nan'
return x1, x2, 'nan'


def fillna_cont(x1_ref: np.ndarray, x2_ref: np.ndarray, inplace: bool = False) -> Any:
def fillna_cont(x1_ref: np.ndarray, x2_ref: np.ndarray) -> Any:
"""
This function fills missing values in x1 and x2 safely for homogeneity tests.
In case when nan value is just set to some constant less than all elements
Expand All @@ -103,8 +95,6 @@ def fillna_cont(x1_ref: np.ndarray, x2_ref: np.ndarray, inplace: bool = False) -
Parameters:
x1_ref (np.array): sample from base period.
x2_ref (np.array): sample from current period.
inplace (bool): whether to modify original x1, x2
or to make copy and return it.
Returns:
x1 (np.array): sample from base period without missing values.
Expand All @@ -113,24 +103,21 @@ def fillna_cont(x1_ref: np.ndarray, x2_ref: np.ndarray, inplace: bool = False) -
"""

# copy inputs to avoid side effects
if inplace:
x1, x2 = x1_ref, x2_ref
else:
x1, x2 = x1_ref.copy(), x2_ref.copy()
x1, x2 = x1_ref.copy(), x2_ref.copy()

# we fill nans with a value that is less than all data:
# it lies below the minimum by the gap between the minimum and the second minimum
# it helps to avoid a lot of empty buckets in grids when running stat. tests
min_ = min(np.min(x1[~pd.isna(x1)]), np.min(x2[~pd.isna(x2)]))
min_ = min(np.min(x1[~isna(x1)]), np.min(x2[~isna(x2)]))

sec_min1 = sec_min(x1[~pd.isna(x1)])
sec_min2 = sec_min(x2[~pd.isna(x2)])
sec_min1 = sec_min(x1[~isna(x1)])
sec_min2 = sec_min(x2[~isna(x2)])

sec_min_ = min(sec_min1, sec_min2)

gap = sec_min_ - min_
x1[pd.isna(x1)] = min_ - gap
x2[pd.isna(x2)] = min_ - gap
x1[isna(x1)] = min_ - gap
x2[isna(x2)] = min_ - gap

return x1, x2, min_ - gap

Expand Down Expand Up @@ -169,15 +156,13 @@ def __init__(self, pval_thresh: float, samp_size: int, bootstrap_num: int):
self.samp_size = samp_size
self.bootstrap_num = bootstrap_num

def run_all(self, x1_ref: np.ndarray, x2_ref: np.ndarray, inplace: bool = False) -> List:
def run_all(self, x1_ref: np.ndarray, x2_ref: np.ndarray) -> List:
"""
Runs all discrete tests for two samples: 'chi2', 'psi'.
Parameters:
x1_ref (np.array): sample from base period.
x2_ref (np.array): sample from current period.
inplace (bool): whether to modify original x1, x2
or to make copy and return it.
Returns:
res (list of tuples): contains tuples of 3 elements.
Expand Down Expand Up @@ -225,14 +210,11 @@ def run_all(self, x1_ref: np.ndarray, x2_ref: np.ndarray, inplace: bool = False)
)

# copy inputs to avoid side effects
if inplace:
x1, x2 = x1_ref, x2_ref
else:
x1, x2 = x1_ref.copy(), x2_ref.copy()
x1, x2 = x1_ref.copy(), x2_ref.copy()

# fill nan values with special method to avoid collisions of category labels
if np.any(pd.isna(x1)) or np.any(pd.isna(x2)):
x1, x2, _ = fillna_discr(x1, x2, inplace=True)
if np.any(isna(x1)) or np.any(isna(x2)):
x1, x2, _ = fillna_discr(x1, x2)

# encode categorical data with integer nums
enc = LabelEncoder()
Expand Down Expand Up @@ -301,15 +283,13 @@ def __init__(self, pval_thresh: float, samp_size: int, bootstrap_num: int, psi_b
self.bootstrap_num = bootstrap_num
self.psi_bins = psi_bins

def run_all(self, x1_ref: np.ndarray, x2_ref: np.ndarray, inplace: bool = False) -> List:
def run_all(self, x1_ref: np.ndarray, x2_ref: np.ndarray) -> List:
"""
Runs all continuous tests for two samples: 'ks', 'cr-vonmis', 'epps-sing', 'psi'.
Parameters:
x1_ref (np.array): sample from base period.
x2_ref (np.array): sample from current period.
inplace (bool): whether to modify original x1, x2
or to make copy and work with it.
Returns:
res (list of tuples): contains tuples of 3 elements.
Expand Down Expand Up @@ -357,14 +337,11 @@ def run_all(self, x1_ref: np.ndarray, x2_ref: np.ndarray, inplace: bool = False)
)

# copy inputs to avoid side effects
if inplace:
x1, x2 = x1_ref, x2_ref
else:
x1, x2 = x1_ref.copy(), x2_ref.copy()
x1, x2 = x1_ref.copy(), x2_ref.copy()

# fill nan values with a special method; the usual 'fillna' doesn't fully suit homogeneity tests
if np.any(pd.isna(x1)) or np.any(pd.isna(x2)):
x1, x2, nan_value = fillna_cont(x1, x2, inplace=True)
if np.any(isna(x1)) or np.any(isna(x2)):
x1, x2, nan_value = fillna_cont(x1, x2)
else:
# this value will indicate psi that there are no nans
nan_value = min(np.min(x1), np.min(x2)) - 1
Expand Down
31 changes: 18 additions & 13 deletions insolver/feature_monitoring/report_template.html
Original file line number Diff line number Diff line change
Expand Up @@ -12,19 +12,18 @@


<style>

body {
body {
background: #f3f3f3;
}

.table td.red {
background: rgb(218, 177, 182, .25);
}

.table td.green {
background: rgba(189, 222, 200, .25);
}

.table td.neutral {
background: rgba(221, 222, 189, .25);
}
Expand All @@ -33,11 +32,11 @@
text-align: center;
}

.graph-text{
.graph-text {
margin-top: 30px;
font-weight: bold;
}

.table {
width: 100%;
border: none;
Expand Down Expand Up @@ -80,7 +79,7 @@
.table tbody tr td:last-child {
border-radius: 0 8px 8px 0;
}

.graph {
border: 1px dotted;
}
Expand All @@ -107,13 +106,13 @@
<h1>Homogeneity Report</h1>
</header>

<main>
{% for set in sets %}
<main>
{% for set in sets %}
<div>
<p class="graph-text lead text-center">{{set[0]}}</p>
<div class="d-md-flex flex-md-equal w-100 my-md-3 ps-md-3">
<div class="graph me-md-3 pt-3 px-3 pt-md-2 px-md-2 text-center overflow-hidden">
{{set[3]}}
{{set[3]|safe}}
</div>
<div class="stats bg-white me-md-3 pt-3 px-3 pt-md-5 px-md-5 text-center overflow-hidden">
<table class="table">
Expand Down Expand Up @@ -153,8 +152,14 @@ <h1>Homogeneity Report</h1>
</div>
</div>
</div>
{% endfor %}
{% endfor %}
</main>

<script>
// Once the DOM has finished loading, click every element whose data-title is "Autoscale".
// NOTE(review): these are presumably the Plotly modebar "Autoscale" buttons of the embedded
// charts, clicked so that each chart rescales itself — confirm against the rendered report.
document.addEventListener("DOMContentLoaded", function () {
const buttons = document.querySelectorAll('[data-title="Autoscale"]');
buttons.forEach(button => button.click());
});
</script>
</body>

</html>
Loading

0 comments on commit ca4f124

Please sign in to comment.