Merge pull request #204 from MindSetLib/dev

HomogeneityReport refactoring
MindSetLib · May 22, 2024 · ca4f124 · ca4f124
2 parents 7fa092c + 220c8f9
commit ca4f124
Show file tree

Hide file tree

Showing 6 changed files with 153 additions and 106 deletions.
diff --git a/insolver/feature_monitoring/homogeneity_report.py b/insolver/feature_monitoring/homogeneity_report.py
@@ -1,11 +1,11 @@
 import os
 import inspect
 import numpy as np
-import plotly as py
 from os.path import dirname
 from typing import List, Sequence, Dict, Union
 from pandas import DataFrame
-from plotly import express as px
+from plotly.offline import plot
+from plotly.graph_objects import Figure, Histogram
 from plotly.figure_factory import create_distplot
 from jinja2 import Environment, FileSystemLoader
 
@@ -14,7 +14,7 @@
 
 def chart_cont(
     x1: np.ndarray, x2: np.ndarray, name1: str, name2: str, limits: Sequence, bins: int = 15, offline: bool = True
-) -> py.graph_objs.Figure:
+) -> Figure:
     """
     This function draws histograms of the given samples on a joint grid.
     It needs the limits of the area of interest and the number of bins.
@@ -54,24 +54,30 @@ def chart_cont(
     # draw hists
     hist_data = [x1_group, x2_group]
     fig = create_distplot(
-        hist_data, group_labels, bin_size=bin_size, histnorm='probability', show_curve=False, show_rug=False
+        hist_data,
+        group_labels,
+        bin_size=bin_size,
+        histnorm='probability',
+        colors=['blue', 'red'],
+        show_curve=False,
+        show_rug=False,
     )
 
     # add details
     fig.update_layout(
-        autosize=False,
-        width=830,
-        height=650,
-        xaxis_range=None,
-        legend=dict(x=0.8, y=0.95, traceorder='normal', font=dict(color='black', size=16)),
+        autosize=True,
+        width=None,
+        height=None,
+        margin=dict(l=5, r=5, t=5, b=5),
+        legend=dict(orientation="h", traceorder='normal'),
     )
     if offline:
-        return py.offline.plot(fig, include_plotlyjs=False, output_type='div')
+        return plot(fig, include_plotlyjs=False, output_type='div')
     else:
         return fig
 
 
-def chart_discr(x1: np.ndarray, x2: np.ndarray, name1: str, name2: str, offline: bool = True) -> py.graph_objs.Figure:
+def chart_discr(x1: np.ndarray, x2: np.ndarray, name1: str, name2: str, offline: bool = True) -> Figure:
     """
     This function draws histograms of the given discrete samples on one figure.
     Unlike chart_cont, it takes no limits or number of bins.
@@ -89,22 +95,22 @@ def chart_discr(x1: np.ndarray, x2: np.ndarray, name1: str, name2: str, offline:
     """
 
     # draw discrete hists
-    fig1 = px.histogram(x1, histnorm='probability', barmode='overlay', color_discrete_sequence=['green'])
-    fig1.for_each_trace(lambda t: t.update(name=name1))
-    fig2 = px.histogram(x2, histnorm='probability', barmode='overlay', color_discrete_sequence=['red'])
-    fig2.for_each_trace(lambda t: t.update(name=name2))
-    fig = py.graph_objects.Figure(data=fig1.data + fig2.data)
+    fig = Figure()
+    fig.add_trace(Histogram(x=x1, name=name1, histnorm='probability', marker=dict(color='blue'), opacity=0.7))
+    fig.add_trace(Histogram(x=x2, name=name2, histnorm='probability', marker=dict(color='red'), opacity=0.7))
+    fig.update_xaxes(type='category')
 
     # add details
     fig.update_layout(
-        autosize=False,
-        width=830,
-        height=650,
-        legend=dict(x=0.8, y=0.95, traceorder='normal', font=dict(color='black', size=16)),
+        autosize=True,
+        width=None,
+        height=None,
+        margin=dict(l=5, r=5, t=5, b=5),
+        barmode='overlay',
+        legend=dict(orientation="h", traceorder='normal'),
     )
-
     if offline:
-        return py.offline.plot(fig, include_plotlyjs=False, output_type='div')
+        return plot(fig, include_plotlyjs=False, output_type='div')
     else:
         return fig
 
@@ -267,13 +273,13 @@ def build_report(
                 psi_bins = 20 if ('psi_bins' not in properties) else properties['psi_bins']
 
                 # manually fill nans
-                x1, x2, _ = fillna_cont(x1, x2, inplace=True)
+                x1, x2, _ = fillna_cont(x1, x2)
 
                 # run tests
                 homogen_tester: Union['ContinuousHomogeneityTests', 'DiscreteHomogeneityTests'] = (
                     ContinuousHomogeneityTests(pval_thresh, samp_size, bootstrap_num, psi_bins)
                 )
-                test_results = homogen_tester.run_all(x1, x2, inplace=True)
+                test_results = homogen_tester.run_all(x1, x2)
 
                 # optional drawing of charts
                 if draw_charts:
@@ -287,11 +293,11 @@ def build_report(
 
             elif feat_type == 'discrete':
                 # manually fill nans
-                x1, x2, nan_value = fillna_discr(x1, x2, inplace=True)
+                x1, x2, nan_value = fillna_discr(x1, x2)
 
                 # run tests
                 homogen_tester = DiscreteHomogeneityTests(pval_thresh, samp_size, bootstrap_num)
-                test_results = homogen_tester.run_all(x1, x2, inplace=True)
+                test_results = homogen_tester.run_all(x1, x2)
 
                 # optional drawing charts
                 if draw_charts:
@@ -353,7 +359,7 @@ def render_report(report_data: list, report_path: str = 'homogeneity_report.html
     curr_folder = dirname(inspect.getfile(HomogeneityReport))
     template_path = curr_folder + '/' + 'report_template.html'
     if not os.path.exists(template_path):
-        raise OSError("Can not find template file. It must be in 'feature_monitoring' package.")
+        raise OSError("Can not find template file. It must be in 'feature_monitoring' module.")
 
     # error situations
     for feat_report in report_data:
@@ -372,5 +378,5 @@ def render_report(report_data: list, report_path: str = 'homogeneity_report.html
     template = env.get_template("report_template.html")
     output = template.render(sets=report_data)
 
-    with open(report_path, 'w') as f:
+    with open(report_path, 'w', encoding="utf-8") as f:
         f.write(output)
diff --git a/insolver/feature_monitoring/homogeneity_tests.py b/insolver/feature_monitoring/homogeneity_tests.py
@@ -1,5 +1,5 @@
 import numpy as np
-import pandas as pd
+from pandas import isna
 from scipy import stats as sps
 from sklearn.preprocessing import LabelEncoder
 from typing import Callable, List, Any
@@ -54,16 +54,14 @@ def bootstrap(x1: np.ndarray, x2: np.ndarray, bootstrap_num: int, samp_size: int
     return pvalue
 
 
-def fillna_discr(x1_ref: np.ndarray, x2_ref: np.ndarray, inplace: bool = False) -> Any:
+def fillna_discr(x1_ref: np.ndarray, x2_ref: np.ndarray) -> Any:
     """
     This function fills missing values in x1 and x2 safely for homogeneity tests.
     It guarantees that missing values will be filled with unique constant.
 
     Parameters:
         x1_ref (np.array): sample from base period.
         x2_ref (np.array): sample from current period.
-        inplace (bool): whether to modify original x1, x2
-        or to make copy and return it.
 
     Returns:
         x1 (np.array): sample from base period without missing values.
@@ -72,29 +70,23 @@ def fillna_discr(x1_ref: np.ndarray, x2_ref: np.ndarray, inplace: bool = False)
     """
 
     # copy inputs to avoid side effects
-    if inplace:
-        x1, x2 = x1_ref, x2_ref
-    else:
-        x1, x2 = x1_ref.copy(), x2_ref.copy()
+    x1, x2 = x1_ref.copy(), x2_ref.copy()
 
     # if we have numeric data we define nan_value as (minimum - 1)
-    if x1.dtype == 'int':
-        nan_value = min(np.min(x1), np.min(x2)) - 1
-        return x1, x2, nan_value
     if x1.dtype == 'float':
-        nan_value = min(np.min(x1[~pd.isna(x1)]), np.min(x2[~pd.isna(x2)])) - 1
-        x1[pd.isna(x1)] = nan_value
-        x2[pd.isna(x2)] = nan_value
+        nan_value = min(np.min(x1[~isna(x1)]), np.min(x2[~isna(x2)])) - 1
+        x1[isna(x1)] = nan_value
+        x2[isna(x2)] = nan_value
         return x1, x2, nan_value
 
     # if we have object data we fill nan with 'nan' str
     if x1.dtype == object:
-        x1[pd.isna(x1)] = 'nan'
-        x2[pd.isna(x2)] = 'nan'
+        x1[isna(x1)] = 'nan'
+        x2[isna(x2)] = 'nan'
         return x1, x2, 'nan'
 
 
-def fillna_cont(x1_ref: np.ndarray, x2_ref: np.ndarray, inplace: bool = False) -> Any:
+def fillna_cont(x1_ref: np.ndarray, x2_ref: np.ndarray) -> Any:
     """
     This function fills missing values in x1 and x2 safely for homogeneity tests.
     In case when nan value is just set to some constant less than all elements
@@ -103,8 +95,6 @@ def fillna_cont(x1_ref: np.ndarray, x2_ref: np.ndarray, inplace: bool = False) -
     Parameters:
         x1_ref (np.array): sample from base period.
         x2_ref (np.array): sample from current period.
-        inplace (bool): whether to modify original x1, x2
-        or to make copy and return it.
 
     Returns:
         x1 (np.array): sample from base period without missing values.
@@ -113,24 +103,21 @@ def fillna_cont(x1_ref: np.ndarray, x2_ref: np.ndarray, inplace: bool = False) -
     """
 
     # copy inputs to avoid side effects
-    if inplace:
-        x1, x2 = x1_ref, x2_ref
-    else:
-        x1, x2 = x1_ref.copy(), x2_ref.copy()
+    x1, x2 = x1_ref.copy(), x2_ref.copy()
 
     # we fill nans with a value that is less than all of the data:
     # it lies below the minimum by the gap between the minimum and the second minimum;
     # this helps to avoid a lot of empty buckets in grids when running stat. tests
-    min_ = min(np.min(x1[~pd.isna(x1)]), np.min(x2[~pd.isna(x2)]))
+    min_ = min(np.min(x1[~isna(x1)]), np.min(x2[~isna(x2)]))
 
-    sec_min1 = sec_min(x1[~pd.isna(x1)])
-    sec_min2 = sec_min(x2[~pd.isna(x2)])
+    sec_min1 = sec_min(x1[~isna(x1)])
+    sec_min2 = sec_min(x2[~isna(x2)])
 
     sec_min_ = min(sec_min1, sec_min2)
 
     gap = sec_min_ - min_
-    x1[pd.isna(x1)] = min_ - gap
-    x2[pd.isna(x2)] = min_ - gap
+    x1[isna(x1)] = min_ - gap
+    x2[isna(x2)] = min_ - gap
 
     return x1, x2, min_ - gap
 
@@ -169,15 +156,13 @@ def __init__(self, pval_thresh: float, samp_size: int, bootstrap_num: int):
         self.samp_size = samp_size
         self.bootstrap_num = bootstrap_num
 
-    def run_all(self, x1_ref: np.ndarray, x2_ref: np.ndarray, inplace: bool = False) -> List:
+    def run_all(self, x1_ref: np.ndarray, x2_ref: np.ndarray) -> List:
         """
         Runs all discrete tests for two samples: 'chi2', 'psi'.
 
         Parameters:
             x1_ref (np.array): sample from base period.
             x2_ref (np.array): sample from current period.
-            inplace (bool): whether to modify original x1, x2
-            or to make copy and return it.
 
         Returns:
             res (list of tuples): contains tuples of 3 elements.
@@ -225,14 +210,11 @@ def run_all(self, x1_ref: np.ndarray, x2_ref: np.ndarray, inplace: bool = False)
             )
 
         # copy inputs to avoid side effects
-        if inplace:
-            x1, x2 = x1_ref, x2_ref
-        else:
-            x1, x2 = x1_ref.copy(), x2_ref.copy()
+        x1, x2 = x1_ref.copy(), x2_ref.copy()
 
         # fill nan values with special method to avoid collisions of category labels
-        if np.any(pd.isna(x1)) or np.any(pd.isna(x2)):
-            x1, x2, _ = fillna_discr(x1, x2, inplace=True)
+        if np.any(isna(x1)) or np.any(isna(x2)):
+            x1, x2, _ = fillna_discr(x1, x2)
 
         # encode categorical data with integer nums
         enc = LabelEncoder()
@@ -301,15 +283,13 @@ def __init__(self, pval_thresh: float, samp_size: int, bootstrap_num: int, psi_b
         self.bootstrap_num = bootstrap_num
         self.psi_bins = psi_bins
 
-    def run_all(self, x1_ref: np.ndarray, x2_ref: np.ndarray, inplace: bool = False) -> List:
+    def run_all(self, x1_ref: np.ndarray, x2_ref: np.ndarray) -> List:
         """
         Runs all continuous tests for two samples: 'ks', 'cr-vonmis', 'epps-sing', 'psi'.
 
         Parameters:
             x1_ref (np.array): sample from base period.
             x2_ref (np.array): sample from current period.
-            inplace (bool): whether to modify original x1, x2
-            or to make copy and work with it.
 
         Returns:
             res (list of tuples): contains tuples of 3 elements.
@@ -357,14 +337,11 @@ def run_all(self, x1_ref: np.ndarray, x2_ref: np.ndarray, inplace: bool = False)
             )
 
         # copy inputs to avoid side effects
-        if inplace:
-            x1, x2 = x1_ref, x2_ref
-        else:
-            x1, x2 = x1_ref.copy(), x2_ref.copy()
+        x1, x2 = x1_ref.copy(), x2_ref.copy()
 
         # fill nan values with special method; usual 'fillna' doesn't fully suit homogeneity tests
-        if np.any(pd.isna(x1)) or np.any(pd.isna(x2)):
-            x1, x2, nan_value = fillna_cont(x1, x2, inplace=True)
+        if np.any(isna(x1)) or np.any(isna(x2)):
+            x1, x2, nan_value = fillna_cont(x1, x2)
         else:
             # this value will indicate to psi that there are no nans
             nan_value = min(np.min(x1), np.min(x2)) - 1

diff --git a/insolver/feature_monitoring/report_template.html b/insolver/feature_monitoring/report_template.html
@@ -12,19 +12,18 @@
 
 
     <style>
-
-       body {
+        body {
             background: #f3f3f3;
         }
-        
+
         .table td.red {
             background: rgb(218, 177, 182, .25);
         }
-        
+
         .table td.green {
             background: rgba(189, 222, 200, .25);
         }
-        
+
         .table td.neutral {
             background: rgba(221, 222, 189, .25);
         }
@@ -33,11 +32,11 @@
             text-align: center;
         }
 
-        .graph-text{
+        .graph-text {
             margin-top: 30px;
             font-weight: bold;
         }
-        
+
         .table {
             width: 100%;
             border: none;
@@ -80,7 +79,7 @@
         .table tbody tr td:last-child {
             border-radius: 0 8px 8px 0;
         }
-        
+
         .graph {
             border: 1px dotted;
         }
@@ -107,13 +106,13 @@
         <h1>Homogeneity Report</h1>
     </header>
 
-<main>
-{% for set in sets %}
+    <main>
+        {% for set in sets %}
         <div>
             <p class="graph-text lead text-center">{{set[0]}}</p>
             <div class="d-md-flex flex-md-equal w-100 my-md-3 ps-md-3">
                 <div class="graph me-md-3 pt-3 px-3 pt-md-2 px-md-2 text-center overflow-hidden">
-                    {{set[3]}}
+                    {{set[3]|safe}}
                 </div>
                 <div class="stats bg-white me-md-3 pt-3 px-3 pt-md-5 px-md-5 text-center overflow-hidden">
                     <table class="table">
@@ -153,8 +152,14 @@ <h1>Homogeneity Report</h1>
                 </div>
             </div>
         </div>
-{% endfor %}
+        {% endfor %}
     </main>
-
+    <script>
+        document.addEventListener("DOMContentLoaded", function () {
+            const buttons = document.querySelectorAll('[data-title="Autoscale"]');
+            buttons.forEach(button => button.click());
+        });
+    </script>
 </body>
+
 </html>