From 350caf4ef6d5c449473cc3a8bdddb81fa45ba9c1 Mon Sep 17 00:00:00 2001
From: Felix Wick
Date: Sat, 28 Oct 2023 21:09:01 +0200
Subject: [PATCH] fix for potential empty bins in multi-dimensional features

---
 cyclic_boosting/generic_loss.py | 17 ++++++++++++-----
 pyproject.toml                  |  2 +-
 tests/test_integration.py       | 20 ++++++++++----------
 3 files changed, 23 insertions(+), 16 deletions(-)

diff --git a/cyclic_boosting/generic_loss.py b/cyclic_boosting/generic_loss.py
index a81fc5c..c850288 100644
--- a/cyclic_boosting/generic_loss.py
+++ b/cyclic_boosting/generic_loss.py
@@ -67,11 +67,18 @@ def calc_parameters(
         """
         sorting = feature.lex_binned_data.argsort()
         sorted_bins = feature.lex_binned_data[sorting]
-        splits_indices = np.unique(sorted_bins, return_index=True)[1][1:]
+        bins, split_indices = np.unique(sorted_bins, return_index=True)
+        split_indices = split_indices[1:]
 
         y_pred = np.hstack((y[..., np.newaxis], self.unlink_func(pred.predict_link())[..., np.newaxis]))
         y_pred = np.hstack((y_pred, self.weights[..., np.newaxis]))
-        y_pred_bins = np.split(y_pred[sorting], splits_indices)
+        y_pred_bins = np.split(y_pred[sorting], split_indices)
+
+        # keep potential empty bins in multi-dimensional features
+        all_bins = range(max(feature.lex_binned_data) + 1)
+        empty_bins = list(set(bins) ^ set(all_bins))
+        for i in empty_bins:
+            y_pred_bins.insert(i, np.zeros((0, 3)))
 
         n_bins = len(y_pred_bins)
         parameters = np.zeros(n_bins)
@@ -380,14 +387,14 @@ def quantile_costs(prediction: np.ndarray, y: np.ndarray, weights: np.ndarray, q
     float
         calcualted quantile costs
     """
-    if not len(y) > 0:
-        raise ValueError("Loss cannot be computed on empty data")
-    else:
+    if len(y) > 0:
         sum_weighted_error = np.nansum(
             ((y < prediction) * (1 - quantile) * (prediction - y) + (y >= prediction) * quantile * (y - prediction))
             * weights
         )
         return sum_weighted_error / np.nansum(weights)
+    else:
+        return 0
 
 
 def quantile_global_scale(
diff --git a/pyproject.toml b/pyproject.toml
index 613a18a..12cc8f0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "cyclic-boosting"
-version = "1.2.0"
+version = "1.2.1"
 description = "Implementation of Cyclic Boosting machine learning algorithms"
 authors = ["Blue Yonder GmbH"]
 packages = [{include = "cyclic_boosting"}]
diff --git a/tests/test_integration.py b/tests/test_integration.py
index 8c892cc..8e258ce 100644
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@@ -489,6 +489,9 @@ def test_multiplicative_quantile_regression_90(is_plot, prepare_data, features,
 def test_multiplicative_quantile_regression_pdf_J_QPD_S(is_plot, prepare_data, features, feature_properties):
     X, y = prepare_data
 
+    # empty bin check
+    X["P_ID"].iloc[1] = 20
+
     quantiles = []
     quantile_values = []
     for quantile in [0.2, 0.5, 0.8]:
@@ -513,23 +516,20 @@ def test_multiplicative_quantile_regression_pdf_J_QPD_S(is_plot, prepare_data, f
         np.testing.assert_almost_equal(j_qpd_s.ppf(0.2), quantile_values[0, i], 3)
         np.testing.assert_almost_equal(j_qpd_s.ppf(0.5), quantile_values[1, i], 3)
         np.testing.assert_almost_equal(j_qpd_s.ppf(0.8), quantile_values[2, i], 3)
 
-        if i == 24:
-            np.testing.assert_almost_equal(j_qpd_s.ppf(0.1), 0.457, 3)
-            np.testing.assert_almost_equal(j_qpd_s.ppf(0.9), 5.509, 3)
-            if is_plot:
+        if is_plot:
+            cdf_truth = smear_discrete_cdftruth(j_qpd_s.cdf, y[i])
+            cdf_truth_list.append(cdf_truth)
+
+            if i == 24:
                 plt.plot([0.2, 0.5, 0.8], [quantile_values[0, i], quantile_values[1, i], quantile_values[2, i]], "ro")
                 xs = np.linspace(0.0, 1.0, 100)
                 plt.plot(xs, j_qpd_s.ppf(xs))
                 plt.savefig("J_QPD_S_integration_" + str(i) + ".png")
                 plt.clf()
 
-        if is_plot:
-            cdf_truth = smear_discrete_cdftruth(j_qpd_s.cdf, y[i])
-            cdf_truth_list.append(cdf_truth)
-
-    cdf_truth = np.asarray(cdf_truth_list)
     if is_plot:
+        cdf_truth = np.asarray(cdf_truth_list)
         plt.hist(cdf_truth[cdf_truth > 0], bins=30)
         plt.savefig("J_QPD_S_cdf_truth_histo.png")
         plt.clf()
@@ -555,7 +555,7 @@ def test_multiplicative_quantile_regression_spline(is_plot, prepare_data, featur
 
     i = 24
     spl_fit = quantile_fit_spline(quantiles, quantile_values[:, i])
-    np.testing.assert_almost_equal(spl_fit(0.2), 0.527, 3)
+    np.testing.assert_almost_equal(spl_fit(0.2), 0.529, 3)
     np.testing.assert_almost_equal(spl_fit(0.5), 2.193, 3)
     np.testing.assert_almost_equal(spl_fit(0.8), 4.21, 3)
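
A note on the generic_loss.py hunk: np.unique only reports bin values that actually occur in feature.lex_binned_data, so np.split returns one block per occupied bin and silently skips any bin that received no samples, shifting every later bin by one list position. The added loop re-inserts a zero-length (0, 3) block at each missing index so that list position and bin index stay aligned before the per-bin parameters are computed. The quantile_costs change looks like the matching guard: an empty bin now yields len(y) == 0, so the cost function returns 0 instead of raising. The sketch below is a minimal, self-contained illustration of that logic, not the library's actual code path; the array lex_binned_data and the random y_pred stand-in are placeholders for feature.lex_binned_data and the stacked (y, prediction, weight) columns.

import numpy as np

# Toy stand-in for feature.lex_binned_data: bin 2 has no samples, as can
# happen for multi-dimensional (lexically combined) features.
lex_binned_data = np.array([0, 1, 1, 3, 0, 3])
# Stand-in for the stacked (y, prediction, weight) columns.
y_pred = np.random.rand(len(lex_binned_data), 3)

sorting = lex_binned_data.argsort()
sorted_bins = lex_binned_data[sorting]

# np.unique returns only the occupied bins, so np.split yields one block
# per occupied bin and skips bin 2 entirely.
bins, split_indices = np.unique(sorted_bins, return_index=True)
split_indices = split_indices[1:]
y_pred_bins = np.split(y_pred[sorting], split_indices)

# Re-insert an empty (0, 3) block for every missing bin so that the list
# index again equals the bin number.
all_bins = range(max(lex_binned_data) + 1)
empty_bins = list(set(bins) ^ set(all_bins))
for i in empty_bins:
    y_pred_bins.insert(i, np.zeros((0, 3)))

assert len(y_pred_bins) == 4
assert y_pred_bins[2].shape == (0, 3)  # empty bin kept as a placeholder

Without the re-insertion, y_pred_bins would have one entry too few and all bins after the empty one would be processed under the wrong bin index.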