diff --git a/.gitignore b/.gitignore index 66b8a9b4a..662a52dc9 100644 --- a/.gitignore +++ b/.gitignore @@ -455,3 +455,6 @@ dask-worker-space/ *.pub *.rdp *_rsa + +# Others +src/main.*.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 41fb21f5e..ae3da4e92 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -344,6 +344,7 @@ endif(USE_CUDA) ) add_executable(lightgbm src/main.cpp src/application/application.cpp ${SOURCES}) +#add_executable(fairgbm_multiple_runs src/main.multiple_runs.cpp src/application/application.cpp ${SOURCES}) list(APPEND SOURCES "src/c_api.cpp") # Only build the R part of the library if building for diff --git a/LICENSE b/LICENSE index 5ae193c94..25df6b8c2 100644 --- a/LICENSE +++ b/LICENSE @@ -1,3 +1,213 @@ + + **For commercial uses of FairGBM please contact .** + + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2022 Feedzai + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + +### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### + + The MIT License (MIT) Copyright (c) Microsoft Corporation diff --git a/include/LightGBM/boosting.h b/include/LightGBM/boosting.h index ddbcdbc18..ffdc84bfc 100644 --- a/include/LightGBM/boosting.h +++ b/include/LightGBM/boosting.h @@ -84,6 +84,14 @@ class LIGHTGBM_EXPORT Boosting { */ virtual bool TrainOneIter(const score_t* gradients, const score_t* hessians) = 0; + /*! + * \brief Gradient ascent step w.r.t. Lagrangian multipliers (used only for constrained optimization) + * \param gradients nullptr for using default objective, otherwise use self-defined boosting + * \param hessians nullptr for using default objective, otherwise use self-defined boosting + * \return True if cannot train anymore (or training has ended due to early stopping) + */ + virtual bool TrainLagrangianOneIter(const score_t* gradients, const score_t* hessians) = 0; + /*! * \brief Rollback one iteration */ diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 5142604ca..b4ee397c6 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -7,6 +7,11 @@ * - nested sections can be placed only at the bottom of parent's section; * - [doc-only] tag indicates that only documentation for this param should be generated and all other actions are performed manually; * - [no-save] tag indicates that this param should not be saved into a model text representation. + * + * **NOTES!** + * - configs for use with the LightGBM::Application, to be passed via command line arguments or argv. + * - can also be filled by means of a config file, passed as config= in this config. + * - see the parameters in this Class to see what configs are available! 
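+ * - illustrative CLI example (file and column names below are hypothetical, not part of this patch): ``./lightgbm config=train.conf constraint_type=fpr constraint_group_column=name:group``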
*/ #ifndef LIGHTGBM_CONFIG_H_ #define LIGHTGBM_CONFIG_H_ @@ -186,6 +191,20 @@ struct Config { // desc = in ``dart``, it also affects on normalization weights of dropped trees double learning_rate = 0.1; + // alias = multiplier_shrinkage_rate, lagrangian_learning_rate, lagrangian_multiplier_learning_rate + // check = >0.0 + // desc = used only for constrained optimization (ignored for standard LightGBM) + // desc = learning rate for the Lagrangian multipliers (which enforce the constraints) + double multiplier_learning_rate = 0.1; + + // alias = lagrangian_multipliers, init_multipliers + // type = multi-double + // default = 0,0,...,0 + // desc = used only for constrained optimization (ignored for standard LightGBM) + // desc = list representing the magnitude of *initial* (first iteration only) penalties for each constraint + // desc = list should have the same number of elements as the number of constraints + std::vector init_lagrangian_multipliers; + // default = 31 // alias = num_leaf, max_leaves, max_leaf // check = >1 @@ -803,6 +822,24 @@ struct Config { #pragma endregion + // alias = output_dir + // type = string + // default = "." + // desc = used only for constrained optimization (ignored for standard LightGBM) + // desc = output dir of gradients and hessians per iteration + // desc = **Note**: can be used only in CLI version + std::string debugging_output_dir = "."; + + // type = int or string + // desc = used only for constrained optimization (ignored for standard LightGBM) + // desc = used to specify the Protected Attribute id column + // desc = use number for index, e.g. ``constraint_group=0`` means column\_0 is the query id + // desc = add a prefix ``name:`` for column name, e.g. ``constraint_group=name:id`` + // desc = **Note**: works only in case of loading data directly from file + // desc = **Note**: index starts from ``0`` and it doesn't count the label column when passing type is ``int``, e.g. 
when label is column\_0 and query\_id is column\_1, the correct parameter is ``query=0`` + // desc = **Note**: group membership values will take type `ushort`, hence keep all values below the maximum according to your compilation settings + std::string constraint_group_column = ""; + #pragma endregion #pragma region Objective Parameters @@ -885,6 +922,107 @@ struct Config { // desc = separate by ``,`` std::vector label_gain; + // type = string + // default = None + // desc = used only for constrained optimization (ignored for standard LightGBM) + // desc = type of group-wise constraint to enforce during training + // desc = can take values "fpr", "fnr", or "fpr,fnr" + std::string constraint_type; + + // alias = constraint_proxy_function, constraint_stepwise_proxy_function + // type = string + // default = "cross_entropy" + // desc = used only for constrained optimization (ignored for standard LightGBM) + // desc = type of proxy function to use in group-wise constraints + // desc = this will be used as a differentiable proxy for the stepwise function in the gradient descent step + // desc = can take values "hinge", "quadratic", or "cross_entropy" + std::string constraint_stepwise_proxy = "cross_entropy"; + + // alias = objective_proxy_function, objective_stepwise_proxy_function + // type = string + // default = None + // desc = used only for constrained optimization (ignored for standard LightGBM) + // desc = type of proxy function to use as the proxy objective + // desc = only used when optimizing for functions with a stepwise (e.g., FNR, FPR) + std::string objective_stepwise_proxy = ""; + + // alias = proxy_margin + // check = >0 + // type = double + // default = 1.0 + // desc = used only for constrained optimization (ignored for standard LightGBM) + // desc = for `ConstrainedCrossEntropy`: the value of the function at x=0; f(0)=stepwise_proxy_margin; (vertical margin) + // desc = for other constrained objectives: the horizontal margin of the function; i.e., for stepwise_proxy_margin=1, the proxy function will be 0 until x=-1 for FPR and non-zero onwards, or non-zero until x=1 for FNR, and non-zero onwards; + // desc = **TODO**: set all functions to use this value as the vertical margin + double stepwise_proxy_margin = 1.0; + + // alias = constraint_fpr_slack, constraint_fpr_delta + // check = >=0 + // check = <1.0 + // type = double + // default = 0 + // desc = used only for constrained optimization (ignored for standard LightGBM) + // desc = the slack when fulfilling group-wise FPR constraints + // desc = when using the value 0.0 this will enforce group-wise FPR to be *exactly* equal + double constraint_fpr_threshold = 0.0; + + // alias = constraint_fnr_slack, constraint_fnr_delta + // check = >=0 + // check = <1.0 + // type = double + // default = 0 + // desc = used only for constrained optimization (ignored for standard LightGBM) + // desc = the slack when fulfilling group-wise FNR constraints + // desc = when using the value 0.0 this will enforce group-wise FNR to be *exactly* equal + double constraint_fnr_threshold = 0.0; + + // check = >=0 + // check = <1.0 + // type = double + // default = 0.5 + // desc = used only for constrained optimization (ignored for standard LightGBM) + // desc = score threshold used for computing the GROUP-WISE confusion matrices + // desc = used to compute violation of group-wise constraints during training + double score_threshold = 0.5; + + // type = string + // desc = used only for constrained optimization (ignored for standard LightGBM) + // desc = 
type of GLOBAL constraint to enforce during training + // desc = can take values "fpr", "fnr", or "fpr,fnr" + // desc = must be paired with the arguments "global_target_" accordingly + std::string global_constraint_type; + + // alias = global_fpr, target_global_fpr + // check = >=0 + // check = <=1.0 + // type = double + // default = 1.0 + // desc = used only for constrained optimization (ignored for standard LightGBM) + // desc = target rate for the global FPR (inequality) constraint + // desc = constraint is fulfilled with global_fpr <= global_target_fpr + // desc = the default value of 1 means that this constraint is always fulfilled (never active) + double global_target_fpr = 1.; + + // alias = global_fnr, target_global_fnr + // check = >=0 + // check = <=1.0 + // type = double + // default = 1.0 + // desc = used only for constrained optimization (ignored for standard LightGBM) + // desc = target rate for the global FNR (inequality) constraint + // desc = constraint is fulfilled with global_fnr <= global_target_fnr + // desc = the default value of 1 means that this constraint is always fulfilled (never active) + double global_target_fnr = 1.; + + // check = >=0 + // check = <1.0 + // type = double + // default = 0.5 + // desc = used only for constrained optimization (ignored for standard LightGBM) + // desc = score threshold for computing the GLOBAL confusion matrix + // desc = used to compute violation of GLOBAL constraints during training + double global_score_threshold = 0.5; + #pragma endregion #pragma region Metric Parameters diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index d2f48ef15..a8a0d323f 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -37,6 +37,7 @@ class DatasetLoader; * 4. Query Weights, auto calculate by weights and query_boundaries(if both of them are existed) * the weight for i-th query is sum(query_boundaries[i] , .., query_boundaries[i+1]) / (query_boundaries[i + 1] - query_boundaries[i+1]) * 5. Initial score. optional. if existing, the model will boost from this score, otherwise will start from 0. +* 6. [FairGBM-only] Group, used for training during constrained optimization. */ class Metadata { public: @@ -69,8 +70,9 @@ class Metadata { * \param num_data Number of training data * \param weight_idx Index of weight column, < 0 means doesn't exists * \param query_idx Index of query id column, < 0 means doesn't exists + * \param constraint_group_idx_ Index of group constraint id column, < 0 means it doesn't exist */ - void Init(data_size_t num_data, int weight_idx, int query_idx); + void Init(data_size_t num_data, int weight_idx, int query_idx, int constraint_group_idx_); /*! * \brief Partition label by used indices @@ -92,6 +94,13 @@ class Metadata { void SetQuery(const data_size_t* query, data_size_t len); + /*! + * \brief Set constraint group information in bulk (for the whole train dataset) + * \param constraint_group constraint group information for each instance. + * \param len the number of elements in the constraint_group array. + */ + void SetConstraintGroup(const float* constraint_group, data_size_t len); + /*! * \brief Set initial scores * \param init_score Initial scores, this class will manage memory for init_score. @@ -158,7 +167,7 @@ class Metadata { /*! 
* \brief Get data boundaries on queries, if not exists, will return nullptr * we assume data will order by query, - * the interval of [query_boundaris[i], query_boundaris[i+1]) + * the interval of [query_boundaries[i], query_boundaries[i+1]) * is the data indices for query i. * \return Pointer of data boundaries on queries */ @@ -200,6 +209,31 @@ class Metadata { } } + /*! + * \brief Set constraint group value for one record + * \param idx Index of this record + * \param value Group constraint value of this record + */ + inline void SetConstraintGroupAt(data_size_t idx, constraint_group_t value) { + constraint_group_[idx] = value; + } + + /*! + * \brief Get pointer of group + * \return Pointer of group + */ + inline const constraint_group_t* group() const { return constraint_group_.data(); } + + /*! \brief Get unique groups in data */ + inline std::vector group_values() const { + std::vector values(constraint_group_); + std::sort(values.begin(), values.end()); + + auto last = std::unique(values.begin(), values.end()); + values.erase(last, values.end()); + return values; + } + /*! * \brief Get size of initial scores */ @@ -246,6 +280,9 @@ class Metadata { bool weight_load_from_file_; bool query_load_from_file_; bool init_score_load_from_file_; + + /*! \brief Group data for group constraints */ + std::vector constraint_group_; }; diff --git a/include/LightGBM/dataset_loader.h b/include/LightGBM/dataset_loader.h index e72dd4910..88cd58437 100644 --- a/include/LightGBM/dataset_loader.h +++ b/include/LightGBM/dataset_loader.h @@ -84,6 +84,9 @@ class DatasetLoader { std::unordered_set categorical_features_; /*! \brief Whether to store raw feature values */ bool store_raw_; + + /*! \brief index of constraint group column */ + int constraint_group_idx_; }; } // namespace LightGBM diff --git a/include/LightGBM/meta.h b/include/LightGBM/meta.h index 3452f28d8..831873257 100644 --- a/include/LightGBM/meta.h +++ b/include/LightGBM/meta.h @@ -47,6 +47,13 @@ typedef double label_t; typedef float label_t; #endif +/*! \brief Type of metadata, include group */ +#ifdef GROUP_T_USE_INT + typedef u_int constraint_group_t; +#else + typedef u_short constraint_group_t; +#endif + const score_t kMinScore = -std::numeric_limits::infinity(); const score_t kEpsilon = 1e-15f; diff --git a/include/LightGBM/objective_function.h b/include/LightGBM/objective_function.h index 5ea838dec..c483742a8 100644 --- a/include/LightGBM/objective_function.h +++ b/include/LightGBM/objective_function.h @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -41,6 +42,8 @@ class ObjectiveFunction { virtual bool IsConstantHessian() const { return false; } + virtual bool IsConstrained() const { return false; } + virtual bool IsRenewTreeOutput() const { return false; } virtual double RenewTreeOutput(double ori_output, std::function, @@ -58,6 +61,8 @@ class ObjectiveFunction { virtual int NumPredictOneRow() const { return 1; } + virtual int NumConstraints() const { return 0; } + /*! \brief The prediction should be accurate or not. True will disable early stopping for prediction. */ virtual bool NeedAccuratePrediction() const { return true; } @@ -90,6 +95,1001 @@ class ObjectiveFunction { LIGHTGBM_EXPORT static ObjectiveFunction* CreateObjectiveFunction(const std::string& str); }; -} // namespace LightGBM +class ConstrainedObjectiveFunction : public ObjectiveFunction +{ +public: + /*! 
\brief virtual destructor */ + virtual ~ConstrainedObjectiveFunction() {} + + void SetUpFromConfig(const Config &config) + { + constraint_type = config.constraint_type; + + // Normalize constraint type + std::transform(constraint_type.begin(), constraint_type.end(), constraint_type.begin(), ::toupper); + if (constraint_type == "FNR,FPR") + constraint_type = "FPR,FNR"; + + fpr_threshold_ = (score_t)config.constraint_fpr_threshold; + fnr_threshold_ = (score_t)config.constraint_fnr_threshold; + score_threshold_ = (score_t)config.score_threshold; + proxy_margin_ = (score_t)config.stepwise_proxy_margin; + + /** Global constraint parameters **/ + global_constraint_type = config.global_constraint_type; + + // Normalize global constraint type + std::transform(global_constraint_type.begin(), global_constraint_type.end(), global_constraint_type.begin(), ::toupper); + if (global_constraint_type == "FNR,FPR") + global_constraint_type = "FPR,FNR"; + + global_target_fpr_ = (score_t)config.global_target_fpr; + global_target_fnr_ = (score_t)config.global_target_fnr; + global_score_threshold_ = (score_t)config.global_score_threshold; + + // Function used as a PROXY for step-wise in the CONSTRAINTS + constraint_stepwise_proxy = ValidateProxyFunctionName(config.constraint_stepwise_proxy, false); + + // Function used as a PROXY for the step-wise in the OBJECTIVE + objective_stepwise_proxy = ValidateProxyFunctionName(config.objective_stepwise_proxy, true); + + // Debug configs + debugging_output_dir_ = config.debugging_output_dir; + } + + /*! + * \brief Initialize + * \param metadata Label data + * \param num_data Number of data + */ + void Init(const Metadata &metadata, data_size_t num_data) override + { + num_data_ = num_data; + label_ = metadata.label(); + weights_ = metadata.weights(); + + // Store Information about the group + group_ = metadata.group(); + group_values_ = metadata.group_values(); + + // Store Information about the labels + total_label_positives_ = 0; + total_label_negatives_ = 0; + ComputeLabelCounts(); + + CHECK_NOTNULL(label_); + Common::CheckElementsIntervalClosed(label_, 0.0f, 1.0f, num_data_, GetName()); + Log::Info("[%s:%s]: (objective) labels passed interval [0, 1] check", GetName(), __func__); + + if (weights_ != nullptr) + { + label_t minw; + double sumw; + Common::ObtainMinMaxSum(weights_, num_data_, &minw, static_cast(nullptr), &sumw); + if (minw < 0.0f) + { + Log::Fatal("[%s]: at least one weight is negative", GetName()); + } + if (sumw < DBL_MIN) + { + Log::Fatal("[%s]: sum of weights is zero", GetName()); + } + } + } + + /** + * Template method for computing an instance's predictive loss value + * from its predicted score (log-odds). + * + * @param label Instance label. + * @param score Instance predicted score (log-odds); + * @return The instance loss value. + */ + virtual double ComputePredictiveLoss(label_t label, double score) const = 0; + + /*! + * \brief Get functions w.r.t. to the lagrangian multipliers. + * \brief This includes the evaluation of both the objective + * \brief function (aka the loss) and also the (real) constraints. + * \brief Therefore, the returned array will be of size. + * \brief NumConstraints + 1 (plus one from the loss lagrang. multiplier). + * \param score prediction score in this round. 
+ */ + virtual std::vector GetLagrangianGradientsWRTMultipliers(const double *score) const + { + if (weights_ != nullptr) + throw std::logic_error("not implemented yet"); // TODO: https://github.com/feedzai/fairgbm/issues/5 + + std::vector constraint_values; + std::unordered_map group_fpr, group_fnr; + + // NOTE! ** MULTIPLIERS ARE ORDERED! ** + // - 1st: group-wise FPR constraints (one multiplier per group) + // - 2nd: group-wise FNR constraints (one multiplier per group) + // - 3rd: global FPR constraint (a single multiplier) + // - 4th: global FNR constraint (a single multiplier) + + // Multiplier corresponding to group-wise FPR constraints + if (IsFPRConstrained()) + { + ComputeFPR(score, score_threshold_, group_fpr); + double max_fpr = Constrained::findMaxValuePair(group_fpr).second; + + // Assuming group_values_ is in ascending order + for (const auto &group : group_values_) + { + double fpr_constraint_value = max_fpr - group_fpr[group] - fpr_threshold_; + constraint_values.push_back(fpr_constraint_value); + +#ifdef DEBUG + Log::Debug( + "DEBUG; true FPR constraint value: c = %.3f - %.3f = %.3f\n", + max_fpr, group_fpr[group], fpr_constraint_value); +#endif + } + } + + // Multiplier corresponding to group-wise FNR constraints + if (IsFNRConstrained()) + { + ComputeFNR(score, score_threshold_, group_fnr); + double max_fnr = Constrained::findMaxValuePair(group_fnr).second; + + // Assuming group_values_ is in ascending order + for (const auto &group : group_values_) + { + double fnr_constraint_value = max_fnr - group_fnr[group] - fnr_threshold_; + constraint_values.push_back(fnr_constraint_value); + +#ifdef DEBUG + Log::Debug( + "DEBUG; true FNR constraint value: c = %.3f - %.3f = %.3f\n", + max_fnr, group_fnr[group], fnr_constraint_value); +#endif + } + } + + // Next multiplier will correspond to the global FPR constraint + if (IsGlobalFPRConstrained()) + { + double global_fpr = ComputeGlobalFPR(score, global_score_threshold_); + double global_fpr_constraint_value = global_fpr - global_target_fpr_; + + constraint_values.push_back(global_fpr_constraint_value); + +#ifdef DEBUG + Log::Debug( + "DEBUG; true global FPR constraint value: c = %.3f - %.3f = %.3f\n", + global_fpr, global_target_fpr_, global_fpr_constraint_value); +#endif + } + + // Next multiplier will correspond to the global FNR constraint + if (IsGlobalFNRConstrained()) + { + double global_fnr = ComputeGlobalFNR(score, global_score_threshold_); + double global_fnr_constraint_value = global_fnr - global_target_fnr_; + + constraint_values.push_back(global_fnr_constraint_value); + +#ifdef DEBUG + Log::Debug( + "DEBUG; true global FNR constraint value: c = %.3f - %.3f = %.3f\n", + global_fnr, global_target_fnr_, global_fnr_constraint_value); +#endif + } + +#ifdef DEBUG + Constrained::write_values(debugging_output_dir_, "constraint_values.dat", constraint_values); +#endif + + return constraint_values; + } + + /*! + * \brief Get gradients of the constraints w.r.t. to the scores (this will use proxy constraints!). 
+ * \param double Lagrangian multipliers in this round + * \param score prediction score in this round + * \gradients Output gradients + * \hessians Output hessians + */ + virtual void GetConstraintGradientsWRTModelOutput(const double *lagrangian_multipliers, + const double *score, score_t *gradients, + score_t * /* hessians */) const + { + + std::unordered_map group_fpr, group_fnr; + std::pair max_proxy_fpr, max_proxy_fnr; + + // Helper constant for BCE-based proxies + double xent_horizontal_shift = log(exp(proxy_margin_) - 1); // here, proxy_margin_ is the VERTICAL margin + + /** ---------------------------------------------------------------- * + * FPR (Proxy) Constraint + * ---------------------------------------------------------------- * + * It corresponds to the result of differentiating the FPR proxy + * constraint w.r.t. the score of the ensemble. + * + * FPR Proxy Constraints: + * lambda_group_i * [max(FPR_group_1, ..., FPR_group_j) - FPR_group_i] + * + * ---------------------------------------------------------------- * + * To compute it, we need to: + * 1. Compute FPR by group + * 2. Determine the group with max(FPR) + * 3. Compute derivative w.r.t. all groups except max(FPR) + * ---------------------------------------------------------------- * + * */ + if (IsFPRConstrained()) + { + if (constraint_stepwise_proxy == "hinge") + ComputeHingeFPR(score, group_fpr); + else if (constraint_stepwise_proxy == "quadratic") + ComputeQuadraticLossFPR(score, group_fpr); + else if (constraint_stepwise_proxy == "cross_entropy") + ComputeXEntropyLossFPR(score, group_fpr); + else + throw std::invalid_argument("constraint_stepwise_proxy=" + constraint_stepwise_proxy + " not implemented!"); + + max_proxy_fpr = Constrained::findMaxValuePair(group_fpr); + } + if (IsFNRConstrained()) + { + if (constraint_stepwise_proxy == "hinge") + ComputeHingeLossFNR(score, group_fnr); + else if (constraint_stepwise_proxy == "quadratic") + ComputeQuadraticLossFNR(score, group_fnr); + else if (constraint_stepwise_proxy == "cross_entropy") + ComputeXEntropyLossFNR(score, group_fnr); + else + throw std::invalid_argument("constraint_stepwise_proxy=" + constraint_stepwise_proxy + " not implemented!"); + + max_proxy_fnr = Constrained::findMaxValuePair(group_fnr); + } + + /** ---------------------------------------------------------------- * + * GRADIENTS (per instance) * + * ---------------------------------------------------------------- */ + if (weights_ != nullptr) + { + throw std::logic_error("not implemented yet"); // TODO: https://github.com/feedzai/fairgbm/issues/5 + } + + // compute pointwise gradients and hessians with implied unit weights +// #pragma omp parallel for schedule(static) // TODO: https://github.com/feedzai/fairgbm/issues/6 + for (data_size_t i = 0; i < num_data_; ++i) + { + const auto group = group_[i]; + + // Constraint index + u_short number_of_groups = group_values_.size(); + u_short multipliers_base_index = 0; + + // ------------------------------------------------------------------- + // Skip FPR propagation if label positive, since LPs do not count for FPR constraints + // ------------------------------------------------------------------- + // Grads of proxy constraints w.r.t. 
the scores: + // (1) 0, if label positive or score <= -margin (default margin=1) + // (2) (m-1) / |LN_group_j| * (margin+score) * sum(lag multipliers except group j), if i belongs to group j whose FPR is maximal + // (3) -lambda_k * (margin+score) / |LN_group_k| if the instance belongs to group k != j (where j has maximal FPR) + // ------------------------------------------------------------------- + if (IsFPRConstrained()) + { + if (label_[i] == 0) + { + const int group_ln = group_label_negatives_.at(group); + + double fpr_constraints_gradient_wrt_pred; + // TODO: https://github.com/feedzai/fairgbm/issues/7 + + // Derivative for hinge-based proxy FPR + if (constraint_stepwise_proxy == "hinge") + fpr_constraints_gradient_wrt_pred = score[i] <= -proxy_margin_ ? 0. : 1. / group_ln; + + // Derivative for BCE-based proxy FPR + else if (constraint_stepwise_proxy == "cross_entropy") { + fpr_constraints_gradient_wrt_pred = (Constrained::sigmoid(score[i] + xent_horizontal_shift)) / group_ln; +// fpr_constraints_gradient_wrt_pred = (Constrained::sigmoid(score[i]) - label_[i]) / group_ln; // without margin + } + + // Loss-function implicitly defined as having a hinge-based derivative (quadratic loss) + else if (constraint_stepwise_proxy == "quadratic") { + fpr_constraints_gradient_wrt_pred = std::max(0., score[i] + proxy_margin_) / group_ln; + } + + else + throw std::invalid_argument("constraint_stepwise_proxy=" + constraint_stepwise_proxy + " not implemented!"); + + // ------------------------------------------------------------------- + // Derivative (2) because instance belongs to group with maximal FPR + // ------------------------------------------------------------------- + if (group == max_proxy_fpr.first) + { + // 2.1) Multiply by (m-1) + fpr_constraints_gradient_wrt_pred *= (number_of_groups - 1.); + + // 2.2) Sum lagrangian multipliers (all except that of group with maximal FPR) + double lag_multipliers = 0; + for (const auto &other_group : group_values_) + { + if (other_group == max_proxy_fpr.first) + continue; + else + lag_multipliers += lagrangian_multipliers[multipliers_base_index + other_group]; + } + + gradients[i] += static_cast(fpr_constraints_gradient_wrt_pred * lag_multipliers); + // hessians[i] += ... + } + else + { + // ---------------------------------------------------------------------- + // Derivative (3) because instance belongs to group with non-maximal FPR + // ---------------------------------------------------------------------- + gradients[i] += static_cast(-1. * fpr_constraints_gradient_wrt_pred * lagrangian_multipliers[multipliers_base_index + group]); + // hessians[i] += ... + } + } + + // Update index of multipliers to be used for next constraints + multipliers_base_index += number_of_groups; + } + + // Skip FNR propagation if label negative, since LNs do not count for FNR constraints + if (IsFNRConstrained()) + { + if (label_[i] == 1) + { + const int group_lp = group_label_positives_.at(group); + + double fnr_constraints_gradient_wrt_pred; + + // Derivative for hinge-based proxy FNR + if (constraint_stepwise_proxy == "hinge") + fnr_constraints_gradient_wrt_pred = score[i] >= proxy_margin_ ? 0. : -1. 
/ group_lp; + + // Derivative for BCE-based proxy FNR + else if (constraint_stepwise_proxy == "cross_entropy") { + fnr_constraints_gradient_wrt_pred = (Constrained::sigmoid(score[i] - xent_horizontal_shift) - 1) / group_lp; +// fnr_constraints_gradient_wrt_pred = (Constrained::sigmoid(score[i]) - label_[i]) / group_lp; // without margin + } + + // Loss-function implicitly defined as having a hinge-based derivative (quadratic loss) + else if (constraint_stepwise_proxy == "quadratic") { + fnr_constraints_gradient_wrt_pred = std::min(0., score[i] - proxy_margin_) / group_lp; + } + + else + throw std::invalid_argument("constraint_stepwise_proxy=" + constraint_stepwise_proxy + " not implemented!"); + + // ------------------------------------------------------------------- + // Derivative (2) because instance belongs to group with max FNR + // ------------------------------------------------------------------- + if (group == max_proxy_fnr.first) + { + // 2.1) Multiply by (m-1) + fnr_constraints_gradient_wrt_pred *= (number_of_groups - 1.0); + + // 2.2) Sum lagrangian multipliers (all except that of group with maximal FNR) + double lag_multipliers = 0; + for (const auto &other_group : group_values_) + { + if (other_group == max_proxy_fnr.first) + continue; + else + lag_multipliers += lagrangian_multipliers[multipliers_base_index + other_group]; + } + + gradients[i] += static_cast(fnr_constraints_gradient_wrt_pred * lag_multipliers); + // hessians[i] += ... + } + else + { + // ---------------------------------------------------------------------- + // Derivative (3) because instance belongs to group with non-maximal FNR + // ---------------------------------------------------------------------- + gradients[i] += static_cast(-1. * fnr_constraints_gradient_wrt_pred * lagrangian_multipliers[multipliers_base_index + group]); + // hessians[i] += ... + } + } + + // Update index of multipliers to be used for next constraints + multipliers_base_index += number_of_groups; + } + + // ** Global Constraints ** + if (IsGlobalFPRConstrained()) + { + if (label_[i] == 0) + { // Condition for non-zero gradient + double global_fpr_constraint_gradient_wrt_pred; + // Gradient for hinge proxy FPR + if (constraint_stepwise_proxy == "hinge") { + global_fpr_constraint_gradient_wrt_pred = score[i] >= -proxy_margin_ ? 1. / total_label_negatives_ : 0.; + } + + // Gradient for BCE proxy FPR + else if (constraint_stepwise_proxy == "cross_entropy") { + global_fpr_constraint_gradient_wrt_pred = (Constrained::sigmoid(score[i] + xent_horizontal_shift)) / total_label_negatives_; +// global_fpr_constraint_gradient_wrt_pred = (Constrained::sigmoid(score[i]) - label_[i]) / total_label_negatives_; // without margin + } + + // Hinge-based gradient (for quadratic proxy FPR) + else if (constraint_stepwise_proxy == "quadratic") { + global_fpr_constraint_gradient_wrt_pred = std::max(0., score[i] + proxy_margin_) / total_label_negatives_; + } + + else + throw std::invalid_argument("constraint_stepwise_proxy=" + constraint_stepwise_proxy + " not implemented!"); + + // Update instance gradient and hessian + gradients[i] += (score_t)(lagrangian_multipliers[multipliers_base_index] * global_fpr_constraint_gradient_wrt_pred); + // hessians[i] += ... 
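+          // Illustrative note (not part of this patch): for the cross-entropy proxy used above,
+          // the per-instance proxy-FPR term is l(s) = log(1 + exp(s + log(exp(b) - 1))) / |LN|,
+          // with b = proxy_margin_ (so that l(0) = b), and its derivative is
+          // d l / d s = sigmoid(s + log(exp(b) - 1)) / |LN|,
+          // which is the expression assigned to global_fpr_constraint_gradient_wrt_pred above
+          // (the group-wise branch uses |LN_group| in place of |LN|).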
+ } + + // Update index of multipliers to be used for next constraints + multipliers_base_index += 1; + } + + if (IsGlobalFNRConstrained()) + { + if (label_[i] == 1) + { // Condition for non-zero gradient + double global_fnr_constraint_gradient_wrt_pred; + + // Gradient for hinge proxy FNR + if (constraint_stepwise_proxy == "hinge") { + global_fnr_constraint_gradient_wrt_pred = score[i] >= proxy_margin_ ? 0. : -1. / total_label_positives_; + } + + // Gradient for BCE proxy FNR + else if (constraint_stepwise_proxy == "cross_entropy") { + global_fnr_constraint_gradient_wrt_pred = (Constrained::sigmoid(score[i] - xent_horizontal_shift) - 1) / total_label_positives_; +// global_fnr_constraint_gradient_wrt_pred = (Constrained::sigmoid(score[i]) - label_[i]) / total_label_positives_; // without margin + } + + // Hinge-based gradient (for quadratic proxy FNR) + else if (constraint_stepwise_proxy == "quadratic") { + global_fnr_constraint_gradient_wrt_pred = std::min(0., score[i] - proxy_margin_) / total_label_positives_; + } + + else { + throw std::invalid_argument("constraint_stepwise_proxy=" + constraint_stepwise_proxy + " not implemented!"); + } + + // Update instance gradient and hessian + gradients[i] += (score_t)(lagrangian_multipliers[multipliers_base_index] * + global_fnr_constraint_gradient_wrt_pred); + // hessians[i] += ... + } + + // Update index of multipliers to be used for next constraints + multipliers_base_index += 1; + } + } + } + + bool IsConstrained() const override { return true; } + + // convert score to a probability + void ConvertOutput(const double *input, double *output) const override + { + *output = 1.0f / (1.0f + std::exp(-(*input))); + } + + bool IsFPRConstrained() const + { + return (constraint_type == "FPR" || constraint_type == "FPR,FNR"); + // NOTE: Order of constraints in config file doesn't matter, it's sorted beforehand + } + + bool IsFNRConstrained() const + { + return (constraint_type == "FNR" || constraint_type == "FPR,FNR"); + } + + bool IsGlobalFPRConstrained() const + { + return (global_constraint_type == "FPR" || global_constraint_type == "FPR,FNR"); + } + + bool IsGlobalFNRConstrained() const + { + return (global_constraint_type == "FNR" || global_constraint_type == "FPR,FNR"); + } + + int NumConstraints() const override + { + int group_size = (int)group_values_.size(); + int num_constraints = 0; + + if (IsFPRConstrained()) + num_constraints += group_size; + if (IsFNRConstrained()) + num_constraints += group_size; + if (IsGlobalFPRConstrained()) + num_constraints += 1; + if (IsGlobalFNRConstrained()) + num_constraints += 1; + + return num_constraints; + } + + /*! + * \brief Computes group-wise false positive rate w.r.t. a given probabilities_threshold. 
+ * \param score prediction score in this round (logodds) + * \param probabilities_threshold to consider for computing the FPR + * \group_fpr Output the FPR per group + */ + void ComputeFPR(const double *score, double probabilities_threshold, std::unordered_map &group_fpr) const + { + std::unordered_map false_positives; + std::unordered_map label_negatives; + + // #pragma omp parallel for schedule(static) // TODO: https://github.com/feedzai/fairgbm/issues/6 + for (data_size_t i = 0; i < num_data_; ++i) + { + constraint_group_t group = group_[i]; + + if (label_[i] == 0) + { + label_negatives[group] += 1; + + const double z = 1.0f / (1.0f + std::exp(-score[i])); + if (z >= probabilities_threshold) + false_positives[group] += 1; + } + } + + for (auto group_id : group_values_) + { + double fpr; + if (label_negatives[group_id] == 0) + fpr = 0; + else + fpr = ((double)false_positives[group_id]) / ((double)label_negatives[group_id]); + + group_fpr[group_id] = fpr; + } + } + + /** + * Computes global False-Positive Rate according to the given threshold. + * @param score + * @param probabilities_threshold + * @return the global FNR + */ + double ComputeGlobalFPR(const double *score, double probabilities_threshold) const + { + int false_positives = 0, label_negatives = 0; + + // #pragma omp parallel for schedule(static) // TODO: https://github.com/feedzai/fairgbm/issues/6 + for (data_size_t i = 0; i < num_data_; ++i) + { + if (label_[i] == 0) + { + label_negatives += 1; + + const double z = 1.0f / (1.0f + std::exp(-score[i])); + if (z >= probabilities_threshold) + false_positives += 1; + } + } + + return (double)false_positives / (double)label_negatives; + } + + /*! + * \brief Get hinge-proxy false positive rate w.r.t. a given margin + * \param array of scores -> prediction score in this round + * \param margin to consider for computing the Hinge approximation of FPR + * \group_fpr Output the proxy FPR per group + */ + void ComputeHingeFPR(const double *score, std::unordered_map &group_fpr) const + { + std::unordered_map false_positives; // map of group index to the respective hinge-proxy FPs + std::unordered_map label_negatives; // map of group index to the respective number of LNs + + // #pragma omp parallel for schedule(static) // TODO: https://github.com/feedzai/fairgbm/issues/6 + for (data_size_t i = 0; i < num_data_; ++i) + { + constraint_group_t group = group_[i]; + + // HingeFPR uses only label negatives + if (label_[i] == 0) + { + label_negatives[group] += 1; + + // proxy_margin_ is the line intercept value + const double hinge_score = proxy_margin_ + score[i]; + false_positives[group] += std::max(0.0, hinge_score); + } + } + + for (auto group_id : group_values_) + { + double fpr; + if (label_negatives[group_id] == 0) + fpr = 0; + else + fpr = false_positives[group_id] / label_negatives[group_id]; + + group_fpr[group_id] = fpr; + } + } + + /** + * Compute quadratic-proxy FPR (with a given margin). 
+ * + * Proxy FPR: (1/2) * (H_i + margin)^2 * I[H_i > -margin and y_i == 0] + * + * @param score array of scores + * @param group_fpr hash-map of group to proxy-FPR + */ + void ComputeQuadraticLossFPR(const double *score, std::unordered_map &group_fpr) const + { + std::unordered_map false_positives; // map of group index to the respective proxy FPs + std::unordered_map label_negatives; // map of group index to the respective number of LNs + + // #pragma omp parallel for schedule(static) // TODO: https://github.com/feedzai/fairgbm/issues/6 + for (data_size_t i = 0; i < num_data_; ++i) + { + constraint_group_t group = group_[i]; + + // FPR uses only label NEGATIVES + if (label_[i] == 0 and score[i] > -proxy_margin_) + { // Conditions for non-zero proxy-FPR value + label_negatives[group] += 1; + + // proxy_margin_ corresponds to the symmetric of the function's zero point; f(-proxy_margin_)=0 + const double quadratic_score = (1. / 2.) * std::pow(score[i] + proxy_margin_, 2); + assert(quadratic_score >= 0.); + false_positives[group] += quadratic_score; + } + } + + for (auto group_id : group_values_) + { + double fpr; + if (label_negatives[group_id] == 0) + fpr = 0; + else + fpr = false_positives[group_id] / label_negatives[group_id]; + + group_fpr[group_id] = fpr; + } + } + + /** + * Compute cross-entropy-proxy FPR. + * Function: + * l(a) = log(1 + exp( a + log(exp(b) - 1) )), where b = proxy_margin_ = l(0) + * + * @param score array of scores + * @param group_fpr hash-map of group to proxy-FPR + */ + void ComputeXEntropyLossFPR(const double *score, std::unordered_map &group_fpr) const + { + std::unordered_map false_positives; // map of group index to the respective proxy FPs + std::unordered_map label_negatives; // map of group index to the respective number of LNs + double xent_horizontal_shift = log(exp(proxy_margin_) - 1); + + // #pragma omp parallel for schedule(static) // TODO: https://github.com/feedzai/fairgbm/issues/6 + for (data_size_t i = 0; i < num_data_; ++i) + { + constraint_group_t group = group_[i]; + + // FPR uses only label NEGATIVES + if (label_[i] == 0) + { + label_negatives[group] += 1; + + // proxy_margin_ corresponds to the vertical margin at x=0; l(0) = proxy_margin_ + const double xent_score = log(1 + exp(score[i] + xent_horizontal_shift)); + assert(xent_score >= 0.); + false_positives[group] += xent_score; + } + } + + for (auto group_id : group_values_) + { + double fpr; + if (label_negatives[group_id] == 0) + fpr = 0; + else + fpr = false_positives[group_id] / label_negatives[group_id]; + + group_fpr[group_id] = fpr; + } + } + + /*! + * \brief Computes group-wise false negative rate w.r.t. a given probabilities_threshold. 
+ * \param score prediction score in this round (log-odds) + * \param probabilities_threshold to consider for computing the FNR + * \group_fnr Output the FNR per group + */ + void ComputeFNR(const double *score, double probabilities_threshold, std::unordered_map &group_fnr) const + { + std::unordered_map false_negatives; + std::unordered_map label_positives; + + // #pragma omp parallel for schedule(static) // TODO: https://github.com/feedzai/fairgbm/issues/6 + for (data_size_t i = 0; i < num_data_; ++i) + { + constraint_group_t group = group_[i]; + + if (label_[i] == 1) + { + label_positives[group] += 1; + + const double z = 1.0f / (1.0f + std::exp(-score[i])); + if (z < probabilities_threshold) + false_negatives[group] += 1; + } + } + + for (auto group_id : group_values_) + { + double fnr; + if (label_positives[group_id] == 0) + fnr = 0; + else + fnr = ((double)false_negatives[group_id]) / ((double)label_positives[group_id]); + group_fnr[group_id] = fnr; + } + }; + + /** + * Computes global False-Negative Rate according to the given threshold. + * @param score + * @param probabilities_threshold + * @return the global FNR + */ + double ComputeGlobalFNR(const double *score, double probabilities_threshold) const + { + int false_negatives = 0, label_positives = 0; + + // #pragma omp parallel for schedule(static) // TODO: https://github.com/feedzai/fairgbm/issues/6 + for (data_size_t i = 0; i < num_data_; ++i) + { + if (label_[i] == 1) + { + label_positives += 1; + + const double z = 1.0f / (1.0f + std::exp(-score[i])); + if (z < probabilities_threshold) + false_negatives += 1; + } + } + + return (double)false_negatives / (double)label_positives; + } + + /*! + * \brief Get hinge-proxy FNR w.r.t. a given margin. + * \param score prediction score in this round + * \param margin to consider for computing the FNR + * \group_fnr Output the proxy FNR per group + */ + void ComputeHingeLossFNR(const double *score, std::unordered_map &group_fnr) const + { + std::unordered_map false_negatives; // map of group index to the respective hinge-proxy FNs + std::unordered_map label_positives; + + // #pragma omp parallel for schedule(static) // TODO: https://github.com/feedzai/fairgbm/issues/6 + for (data_size_t i = 0; i < num_data_; ++i) + { + constraint_group_t group = group_[i]; + + if (label_[i] == 1) + { + label_positives[group] += 1; + + const double hinge_score = proxy_margin_ - score[i]; + false_negatives[group] += std::max(0.0, hinge_score); + } + } + + for (auto group_id : group_values_) + { + double fnr; + if (label_positives[group_id] == 0) + fnr = 0; + else + fnr = false_negatives[group_id] / label_positives[group_id]; + group_fnr[group_id] = fnr; + } + }; + + /** + * Compute quadratic-proxy FNR (with a given margin). 
+ * + * Proxy FNR: (1/2) * (H_i - margin)^2 * I[H_i < margin and y_i == 1] + * + * @param score array of scores + * @param group_fnr hash-map of group to proxy-FNR + */ + void ComputeQuadraticLossFNR(const double *score, std::unordered_map &group_fnr) const + { + std::unordered_map false_negatives; // map of group index to the respective proxy FPs + std::unordered_map label_positives; // map of group index to the respective number of LNs + + // #pragma omp parallel for schedule(static) // TODO: https://github.com/feedzai/fairgbm/issues/6 + for (data_size_t i = 0; i < num_data_; ++i) + { + constraint_group_t group = group_[i]; + + // FNR uses only label POSITIVES + if (label_[i] == 1 and score[i] < proxy_margin_) + { // Conditions for non-zero proxy-FNR value + label_positives[group] += 1; + + // proxy_margin_ corresponds to the function's zero point; f(proxy_margin_)=0 + const double quadratic_score = (1. / 2.) * std::pow(score[i] - proxy_margin_, 2); + assert(quadratic_score >= 0.); + false_negatives[group] += quadratic_score; + } + } + + for (auto group_id : group_values_) + { + double fnr; + if (label_positives[group_id] == 0) + fnr = 0; + else + fnr = false_negatives[group_id] / label_positives[group_id]; + + group_fnr[group_id] = fnr; + } + } + + /** + * Compute cross-entropy-proxy FNR. + * Function: + * l(a) = log(1 + exp( -a + log(exp(b) - 1) )), where b = proxy_margin_ = l(0) + * + * @param score array of scores + * @param group_fnr hash-map of group to proxy-FNR + */ + void ComputeXEntropyLossFNR(const double *score, std::unordered_map &group_fnr) const + { + std::unordered_map false_negatives; // map of group index to the respective proxy FPs + std::unordered_map label_positives; // map of group index to the respective number of LNs + double xent_horizontal_shift = log(exp(proxy_margin_) - 1); + + // #pragma omp parallel for schedule(static) // TODO: https://github.com/feedzai/fairgbm/issues/6 + for (data_size_t i = 0; i < num_data_; ++i) + { + constraint_group_t group = group_[i]; + + // FNR uses only label POSITIVES + if (label_[i] == 1) + { + label_positives[group] += 1; + + // proxy_margin_ corresponds to the vertical margin at x=0; l(0) = proxy_margin_ + const double xent_score = log(1 + exp(xent_horizontal_shift - score[i])); + assert(xent_score >= 0.); + false_negatives[group] += xent_score; + } + } + + for (auto group_id : group_values_) + { + double fnr; + if (label_positives[group_id] == 0) + fnr = 0; + else + fnr = false_negatives[group_id] / label_positives[group_id]; + + group_fnr[group_id] = fnr; + } + } + + /*! + * \brief Get label positive and label negative counts. 
+ */ + void ComputeLabelCounts() + { + // #pragma omp parallel for schedule(static) // TODO: https://github.com/feedzai/fairgbm/issues/6 + for (data_size_t i = 0; i < num_data_; ++i) + { + if (label_[i] == 1) + { + this->group_label_positives_[group_[i]] += 1; + this->total_label_positives_ += 1; + } + + else if (label_[i] == 0) + { + this->group_label_negatives_[group_[i]] += 1; + this->total_label_negatives_ += 1; + } + + else + throw std::runtime_error("invalid label type"); + } + }; + +protected: + static std::string ValidateProxyFunctionName(std::string func_name, bool allow_empty = false) + { + std::transform(func_name.begin(), func_name.end(), func_name.begin(), ::tolower); + if (func_name == "bce" or func_name == "xentropy" or func_name == "entropy") + func_name = "cross_entropy"; + + if (not( + func_name == "hinge" or + func_name == "quadratic" or + func_name == "cross_entropy" or + (allow_empty and func_name.empty()))) + { + throw std::invalid_argument("Got invalid proxy function: '" + func_name + "'"); + } + + return func_name; + } + + /*! \brief Number of data points */ + data_size_t num_data_; + /*! \brief Pointer for label */ + const label_t *label_; + /*! \brief Weights for data */ + const label_t *weights_; + + /*! \brief Pointer for group */ + const constraint_group_t *group_; + /*! \brief Unique group values */ + std::vector group_values_; + + /*! \brief Label positives per group */ + std::unordered_map group_label_positives_; + /*! \brief Label Negatives per group */ + std::unordered_map group_label_negatives_; + + /*! \brief Total number of Label Positives */ + int total_label_positives_ = 0; + + /*! \brief Total number of Label Negatives */ + int total_label_negatives_ = 0; + + /*! \brief Type of constraint */ + std::string constraint_type; + + /*! \brief Function to use as a proxy for the step-wise function in CONSTRAINTS. */ + std::string constraint_stepwise_proxy; + + /*! \brief Function to use as a proxy for the step-wise function in the OBJECTIVE. */ + std::string objective_stepwise_proxy; + + /*! \brief Score threshold to compute confusion matrix (over predicted probabilities) */ + score_t score_threshold_ = 0.5; + + /*! \brief FPR threshold used in FPR constraints (small margin for constraint fulfillment) */ + score_t fpr_threshold_ = 0.0; + + /*! \brief FNR threshold used in FNR constraints (small margin for constraint fulfillment) */ + score_t fnr_threshold_ = 0.0; + + /*! \brief Margin threshold used in the Hinge approximation */ + score_t proxy_margin_ = 1.0; + + /*! \brief Type of global constraint */ + std::string global_constraint_type; + + /*! \brief Target value for the global FPR constraint */ + score_t global_target_fpr_; + + /*! \brief Target value for the global FNR constraint */ + score_t global_target_fnr_; + + /*! \brief Score threshold used for the global constraints */ + score_t global_score_threshold_ = 0.5; + + /*! 
\brief Where to save debug files to */ + std::string debugging_output_dir_; +}; +} // namespace LightGBM #endif // LightGBM_OBJECTIVE_FUNCTION_H_ diff --git a/include/LightGBM/utils/common.h b/include/LightGBM/utils/common.h index 43573573d..c9609f038 100644 --- a/include/LightGBM/utils/common.h +++ b/include/LightGBM/utils/common.h @@ -455,8 +455,8 @@ inline static std::vector StringToArrayFast(const std::string& str, int n) { return ret; } -template -inline static std::string Join(const std::vector& strs, const char* delimiter, const bool force_C_locale = false) { +template +inline static std::string Join(const std::vector& strs, const char* delimiter, const bool force_C_locale = false) { if (strs.empty()) { return std::string(""); } @@ -1205,7 +1205,7 @@ struct __TToStringHelper { * Converts an array to a string with with values separated by the space character. * This method replaces Common's ``ArrayToString`` and ``ArrayToStringFast`` functionality * and is locale-independent. -* +* * \note If ``high_precision_output`` is set to true, * floating point values are output with more digits of precision. */ diff --git a/include/LightGBM/utils/constrained.hpp b/include/LightGBM/utils/constrained.hpp new file mode 100644 index 000000000..a8ce73a32 --- /dev/null +++ b/include/LightGBM/utils/constrained.hpp @@ -0,0 +1,78 @@ +/** + * The copyright of this file belongs to Feedzai. The file cannot be + * reproduced in whole or in part, stored in a retrieval system, + * transmitted in any form, or by any means electronic, mechanical, + * photocopying, or otherwise, without the prior permission of the owner. + * + * (c) 2021 Feedzai, Strictly Confidential + */ + +#ifndef LIGHTGBM_CONSTRAINED_HPP +#define LIGHTGBM_CONSTRAINED_HPP + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace LightGBM { +namespace Constrained { + +/** + * Standard sigmoid mathematical function. + * @param x the input to the function. + * @return the sigmoid of the input. + */ +inline double sigmoid(double x) { + return 1. / (1. + std::exp(-x)); +} + +/** + * Finds the (key, value) pair with highest value. + * @tparam Key The type of the Map Key. + * @tparam Value The type of the Map Value. + * @param x Reference to the map to search over. + * @return The pair with highest value V. + */ +template +std::pair findMaxValuePair(std::unordered_map const &x) +{ + return *std::max_element( + x.begin(), x.end(), + [](const std::pair &p1, const std::pair &p2) { + return p1.second < p2.second; + } + ); +} + +/** + * Writes the given values to the end of the given file. + * @tparam T The type of values in the input vector. + * @tparam Allocator The type of allocator in the input vector. + * @param dir The directory of the file to write on. + * @param filename The name of the file to write on. + * @param values A vector of the values to append to the file. + */ +template> +void write_values(const std::string& dir, const std::string& filename, + std::vector values) { + struct stat buf; + + std::string filename_path = dir + "/" + filename; + bool file_exists = (stat(filename_path.c_str(), &buf) != -1); + + std::ofstream outfile; + outfile.open(filename_path, std::ios::out | (file_exists ? 
std::ios::app : std::ios::trunc)); + outfile << LightGBM::Common::Join(values, ",") << std::endl; + + outfile.close(); +} +} +} + +#endif //LIGHTGBM_CONSTRAINED_HPP diff --git a/src/application/application.cpp b/src/application/application.cpp index e82cfcada..c0276d8e3 100644 --- a/src/application/application.cpp +++ b/src/application/application.cpp @@ -49,6 +49,11 @@ Application::~Application() { } } +/** + * Loads the application config, either as command-line arguments or from a config file. + * @param argc Number of command-line arguments. + * @param argv Array of strings containing the command-line arguments. A common element is "config=". + */ void Application::LoadParameters(int argc, char** argv) { std::unordered_map params; for (int i = 1; i < argc; ++i) { diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index d393d46d5..33959475c 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -41,6 +42,7 @@ GBDT::GBDT() average_output_ = false; tree_learner_ = nullptr; linear_tree_ = false; + debugging_output_dir_ = "."; } GBDT::~GBDT() { @@ -83,14 +85,34 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective if (objective_function_ != nullptr) { num_tree_per_iteration_ = objective_function_->NumModelPerIteration(); if (objective_function_->IsRenewTreeOutput() && !config->monotone_constraints.empty()) { - Log::Fatal("Cannot use ``monotone_constraints`` in %s objective, please disable it.", objective_function_->GetName()); + Log::Fatal("Cannot use ``monotone_constraints`` in %s objective, please disable it.", + objective_function_->GetName()); } } is_constant_hessian_ = GetIsConstHessian(objective_function); - tree_learner_ = std::unique_ptr(TreeLearner::CreateTreeLearner(config_->tree_learner, config_->device_type, - config_.get())); + // load output dir + debugging_output_dir_ = config->debugging_output_dir; + + // constraint configurations + is_constrained_ = objective_function->IsConstrained(); + lagrangian_learning_rate_ = config_->multiplier_learning_rate; + + int num_constraints = objective_function->NumConstraints(); + + // If no Lagrangian multipliers are specified, start everything at zero + if ((config->init_lagrangian_multipliers).empty()) { + std::vector default_lag_multipliers(num_constraints, 0); + lagrangian_multipliers_.push_back(default_lag_multipliers); + } else { + CHECK_EQ(num_constraints, (int) config->init_lagrangian_multipliers.size()); + lagrangian_multipliers_.push_back(config->init_lagrangian_multipliers); + } + + tree_learner_ = std::unique_ptr( + TreeLearner::CreateTreeLearner(config_->tree_learner, config_->device_type, config_.get()) + ); // init tree learner tree_learner_->Init(train_data_, is_constant_hessian_); @@ -167,15 +189,51 @@ void GBDT::AddValidDataset(const Dataset* valid_data, } } +/** + * @brief Computes gradients and hessians. + */ void GBDT::Boosting() { Common::FunctionTimer fun_timer("GBDT::Boosting", global_timer); if (objective_function_ == nullptr) { Log::Fatal("No object function provided"); } - // objective function will calculate gradients and hessians + // Objective function will calculate gradients and hessians int64_t num_score = 0; - objective_function_-> - GetGradients(GetTrainingScore(&num_score), gradients_.data(), hessians_.data()); + + // Get predictions for all instances - in log-odds! 
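  // (A raw "log-odds" score s maps to a predicted probability via sigmoid(s) = 1 / (1 + exp(-s));
  //  the predictive-loss and constraint gradients computed below are therefore expressed directly
  //  in terms of these raw scores rather than probabilities.)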
+ const double *score = GetTrainingScore(&num_score); + + // GetGradients computes only gradients/hessians from the predictive loss! + // and will change the gradients and hessians variables in place. + objective_function_->GetGradients(score, gradients_.data(), hessians_.data()); + // ^ a.k.a. GetPredictiveLossGradientsWRTModelOutput + + if (is_constrained_) { + auto constrained_objective_function = dynamic_cast(objective_function_); + + // Compute the contribution of the constraints for the Lagrangian! + // (as we're in the descent step, this may use the proxy constraints) + constrained_objective_function->GetConstraintGradientsWRTModelOutput( + lagrangian_multipliers_.back().data(), + score, gradients_.data(), hessians_.data()); + // ^ will change gradients and hessians in place + // + // NOTE: lagrangian_multipliers is a vector of vectors - each element represents the multipliers at a given iteration; + // TODO: https://github.com/feedzai/fairgbm/issues/8 + +#ifdef DEBUG + // Dump lagrangian multipliers + Constrained::write_values(debugging_output_dir_, "lagrangian_multipliers.dat", lagrangian_multipliers_.back()); + + // Dump the gradients of the Lagrangian (grads of loss + grads of constraints) + Constrained::write_values>( + debugging_output_dir_, "gradients.lagrangian.dat", gradients_); + + // Dump hessians, we don't currently use them though :P + Constrained::write_values>( + debugging_output_dir_, "hessians.lagrangian.dat", hessians_); +#endif + } } data_size_t GBDT::BaggingHelper(data_size_t start, data_size_t cnt, data_size_t* buffer) { @@ -263,13 +321,20 @@ void GBDT::Bagging(int iter) { void GBDT::Train(int snapshot_freq, const std::string& model_output_path) { Common::FunctionTimer fun_timer("GBDT::Train", global_timer); - bool is_finished = false; + bool is_finished = false, is_finished_lagrangian = false; auto start_time = std::chrono::steady_clock::now(); - for (int iter = 0; iter < config_->num_iterations && !is_finished; ++iter) { + + for (int iter = 0; iter < config_->num_iterations and (!is_finished or !is_finished_lagrangian); ++iter) { + + // Do one training iteration + // - execute a descent step on the loss function; + // - (optionally) execute an ascent step w.r.t. the Lagrangian multipliers (only if using constrained optim.) is_finished = TrainOneIter(nullptr, nullptr); + if (!is_finished) { is_finished = EvalAndCheckEarlyStopping(); } + auto end_time = std::chrono::steady_clock::now(); // output used time per iteration Log::Info("%f seconds elapsed, finished iteration %d", std::chrono::durationGetName()) == std::string("regression_l1") || std::string(objective_function_->GetName()) == std::string("quantile") || std::string(objective_function_->GetName()) == std::string("mape")) { - Log::Warning("Disabling boost_from_average in %s may cause the slow convergence", objective_function_->GetName()); + Log::Warning("Disabling boost_from_average in %s may cause the slow convergence", + objective_function_->GetName()); } } return 0.0f; } +/** + * @brief Descent step! + * + * @param gradients + * @param hessians + * @return whether training has ended + */ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { Common::FunctionTimer fun_timer("GBDT::TrainOneIter", global_timer); + + // Step 1. Initialize vector to store the scores for each tree in the iteration + // (boosting always uses 1 tree per iteration) std::vector init_scores(num_tree_per_iteration_, 0.0); + + // Step 2. 
Add first weak learner (predict the average value, aka, BoostFromAverage) // boosting first if (gradients == nullptr || hessians == nullptr) { for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) { init_scores[cur_tree_id] = BoostFromAverage(cur_tree_id, true); } + + // Step 2.1. Compute gradients and hessians Boosting(); gradients = gradients_.data(); hessians = hessians_.data(); } + + // Step 3. Run bagging // bagging logic - Bagging(iter_); + Bagging(iter_); // e.g., run GOSS if LightGBM (or Bagging only if RF) + // Step 4. Fit a weak learner (if RF, will run one split for multiple trees) bool should_continue = false; for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) { const size_t offset = static_cast(cur_tree_id) * num_data_; - std::unique_ptr new_tree(new Tree(2, false, false)); + + // Step 4.1. Create a new tree + std::unique_ptr new_tree(new Tree(2, false, false)); // this is a placeholder pointer + + // class_need_train_ will keep tabs of which trees in an RF haven't finished training if (class_need_train_[cur_tree_id] && train_data_->num_features() > 0) { auto grad = gradients + offset; auto hess = hessians + offset; + // need to copy gradients for bagging subset. if (is_use_subset_ && bag_data_cnt_ < num_data_) { for (int i = 0; i < bag_data_cnt_; ++i) { @@ -397,24 +485,41 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { grad = gradients_.data() + offset; hess = hessians_.data() + offset; } + + // Step 4.2. Train the tree (use grads and hessians) bool is_first_tree = models_.size() < static_cast(num_tree_per_iteration_); new_tree.reset(tree_learner_->Train(grad, hess, is_first_tree)); } + // We found at least a split! if (new_tree->num_leaves() > 1) { should_continue = true; + // Get current scores of each instance in dataset, given by this tree (aka offset) auto score_ptr = train_score_updater_->score() + offset; - auto residual_getter = [score_ptr](const label_t* label, int i) {return static_cast(label[i]) - score_ptr[i]; }; + // Get residual of each instance in dataset (label - predict) + auto residual_getter = [score_ptr](const label_t* label, int i) { + return static_cast(label[i]) - score_ptr[i]; + }; + // Recompute tree leaf values given the specific objective function tree_learner_->RenewTreeOutput(new_tree.get(), objective_function_, residual_getter, num_data_, bag_data_indices_.data(), bag_data_cnt_); + // shrinkage by learning rate new_tree->Shrinkage(shrinkage_rate_); + // update score UpdateScore(new_tree.get(), cur_tree_id); + + // Add bias if any was computed (from BoostFromAverage) + // (should only add for the first boosting iteration) if (std::fabs(init_scores[cur_tree_id]) > kEpsilon) { new_tree->AddBias(init_scores[cur_tree_id]); } - } else { + } + + // The tree wasn't grown <=> we didn't find a split w/ positive IG + // This means that the prediction will be simply to boost from average. + else { // only add default score one-time if (models_.size() < static_cast(num_tree_per_iteration_)) { double output = 0.0; @@ -425,6 +530,7 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { } else { output = init_scores[cur_tree_id]; } + // Add a constraint-value tree new_tree->AsConstantTree(output); // updates scores train_score_updater_->AddScore(output, cur_tree_id); @@ -447,10 +553,60 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { return true; } + // Only if running constrained optimization! 
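  // (Each boosting round is therefore one step of an alternating min-max scheme: the tree fitted
  //  above is a gradient-descent step on the Lagrangian w.r.t. the model, while the call below is
  //  a projected gradient-ascent step on the Lagrangian w.r.t. the multipliers.)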
+ // Ascent step: update value of Lagrangian multipliers + if (is_constrained_) { + TrainLagrangianOneIter(nullptr, nullptr); + } + ++iter_; return false; } +// TODO: https://github.com/feedzai/fairgbm/issues/7 +// - implement normalization / bound on multipliers; +// - implement early stopping criteria (convergence fulfilled); +/*! +* \brief Gradient ascent step w.r.t. Lagrangian multipliers (used only for constrained optimization) +* \param gradients nullptr for using default objective, otherwise use self-defined boosting +* \param hessians nullptr for using default objective, otherwise use self-defined boosting +* \return True if cannot train anymore (or training has ended due to early stopping) +*/ +bool GBDT::TrainLagrangianOneIter(const score_t* /* gradients */, const score_t* /* hessians */) { + auto constrained_objective_function = dynamic_cast(objective_function_); + + int64_t num_score = 0; + // Get Lagrangian gradients w.r.t. multipliers + auto lag_updates = constrained_objective_function->GetLagrangianGradientsWRTMultipliers( + GetTrainingScore(&num_score)); + + // Get Lagrangian multipliers of the latest iteration + auto current_lag_multipliers = lagrangian_multipliers_.back(); + + // Initialize updated lagrangian multipliers w/ previous value + std::vector updated_lag_multipliers(current_lag_multipliers); + + // Gradient ascent in Lagrangian multipliers (or constraint space) + for (uint i = 0; i < lag_updates.size(); i++) { + updated_lag_multipliers[i] += lagrangian_learning_rate_ * lag_updates[i]; + + // Ensuring multipliers >= 0 -> using *INEQUALITY* constraints! c(theta) <= 0 + updated_lag_multipliers[i] = std::max(0.0, updated_lag_multipliers[i]); + // NOTE + // - This aims to guarantee that the problem remains bounded, which is true, + // provided the Lagrangian multiplier remains >= 0; + // - If multipliers are allowed to go negative -> using *EQUALITY* constraints! 
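    // Worked example with illustrative numbers: for multiplier_learning_rate = 0.1, a current
    // multiplier of 0.5, and a violated constraint c(theta) = +0.2, the update gives
    // max(0, 0.5 + 0.1 * 0.2) = 0.52; for a satisfied constraint c(theta) = -0.2 it gives
    // max(0, 0.5 - 0.02) = 0.48. Multipliers thus grow while a constraint is violated and
    // shrink, but never below zero, once it is satisfied.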
+ } + lagrangian_multipliers_.push_back(updated_lag_multipliers); + +#ifdef DEBUG + // Log constraints violation to file + Constrained::write_values(debugging_output_dir_, "functions_evals.dat", lag_updates); +#endif + + return false; +} + void GBDT::RollbackOneIter() { if (iter_ <= 0) { return; } // reset score @@ -466,6 +622,11 @@ void GBDT::RollbackOneIter() { for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) { models_.pop_back(); } + + // remove lagrangian multipliers if constrained objective + if (is_constrained_) + lagrangian_multipliers_.pop_back(); + --iter_; } @@ -488,6 +649,7 @@ bool GBDT::EvalAndCheckEarlyStopping() { return is_met_early_stopping; } +// This method updates the current (total) score associated with each instance void GBDT::UpdateScore(const Tree* tree, const int cur_tree_id) { Common::FunctionTimer fun_timer("GBDT::UpdateScore", global_timer); // update training score @@ -496,7 +658,11 @@ void GBDT::UpdateScore(const Tree* tree, const int cur_tree_id) { // we need to predict out-of-bag scores of data for boosting if (num_data_ - bag_data_cnt_ > 0) { - train_score_updater_->AddScore(tree, bag_data_indices_.data() + bag_data_cnt_, num_data_ - bag_data_cnt_, cur_tree_id); + train_score_updater_->AddScore( + tree, + bag_data_indices_.data() + bag_data_cnt_, + num_data_ - bag_data_cnt_, + cur_tree_id); } } else { @@ -611,7 +777,8 @@ void GBDT::PredictContrib(const double* features, double* output) const { for (int i = start_iteration_for_pred_; i < end_iteration_for_pred; ++i) { // predict all the trees for one iteration for (int k = 0; k < num_tree_per_iteration_; ++k) { - models_[i * num_tree_per_iteration_ + k]->PredictContrib(features, num_features, output + k*(num_features + 1)); + models_[i * num_tree_per_iteration_ + k]->PredictContrib( + features, num_features, output + k*(num_features + 1)); } } } @@ -691,7 +858,9 @@ void GBDT::ResetTrainingData(const Dataset* train_data, const ObjectiveFunction* if (objective_function_ != nullptr) { CHECK_EQ(num_tree_per_iteration_, objective_function_->NumModelPerIteration()); if (objective_function_->IsRenewTreeOutput() && !config_->monotone_constraints.empty()) { - Log::Fatal("Cannot use ``monotone_constraints`` in %s objective, please disable it.", objective_function_->GetName()); + Log::Fatal( + "Cannot use ``monotone_constraints`` in %s objective, please disable it.", + objective_function_->GetName()); } } is_constant_hessian_ = GetIsConstHessian(objective_function); @@ -746,8 +915,11 @@ void GBDT::ResetConfig(const Config* config) { if (!config->feature_contri.empty()) { CHECK_EQ(static_cast(train_data_->num_total_features()), config->feature_contri.size()); } - if (objective_function_ != nullptr && objective_function_->IsRenewTreeOutput() && !config->monotone_constraints.empty()) { - Log::Fatal("Cannot use ``monotone_constraints`` in %s objective, please disable it.", objective_function_->GetName()); + if (objective_function_ != nullptr && objective_function_->IsRenewTreeOutput() && + !config->monotone_constraints.empty()) { + Log::Fatal( + "Cannot use ``monotone_constraints`` in %s objective, please disable it.", + objective_function_->GetName()); } early_stopping_round_ = new_config->early_stopping_round; shrinkage_rate_ = new_config->learning_rate; @@ -785,8 +957,11 @@ void GBDT::ResetBaggingConfig(const Config* config, bool is_change_dataset) { if ((config->bagging_fraction < 1.0 || balance_bagging_cond) && config->bagging_freq > 0) { need_re_bagging_ = false; if (!is_change_dataset 
&& - config_.get() != nullptr && config_->bagging_fraction == config->bagging_fraction && config_->bagging_freq == config->bagging_freq - && config_->pos_bagging_fraction == config->pos_bagging_fraction && config_->neg_bagging_fraction == config->neg_bagging_fraction) { + config_.get() != nullptr && + config_->bagging_fraction == config->bagging_fraction && + config_->bagging_freq == config->bagging_freq && + config_->pos_bagging_fraction == config->pos_bagging_fraction && + config_->neg_bagging_fraction == config->neg_bagging_fraction) { return; } if (balance_bagging_cond) { diff --git a/src/boosting/gbdt.h b/src/boosting/gbdt.h index a99b5fb9a..34acfdf4f 100644 --- a/src/boosting/gbdt.h +++ b/src/boosting/gbdt.h @@ -146,6 +146,14 @@ class GBDT : public GBDTBase { */ bool TrainOneIter(const score_t* gradients, const score_t* hessians) override; + /*! + * \brief Training logic for the constrained optimization step. + * \param gradients nullptr for using default objective, otherwise use self-defined boosting + * \param hessians nullptr for using default objective, otherwise use self-defined boosting + * \return True if cannot train any more + */ + bool TrainLagrangianOneIter(const score_t* gradients, const score_t* hessians) override; + /*! * \brief Rollback one iteration */ @@ -534,6 +542,18 @@ class GBDT : public GBDTBase { ParallelPartitionRunner bagging_runner_; Json forced_splits_json_; bool linear_tree_; + + /*! \brief Whether we're running constrained optimization */ + bool is_constrained_; + + /*! \brief Shrinkage rate for the Ascent step */ + double lagrangian_learning_rate_; + + /*! \brief Lagrangian multiplier(s) per iteration */ + std::vector> lagrangian_multipliers_; // TODO: https://github.com/feedzai/fairgbm/issues/8 + + /*! \brief Output directory to store debug files (e.g., gradients/hessians) */ + std::string debugging_output_dir_; }; } // namespace LightGBM diff --git a/src/io/config.cpp b/src/io/config.cpp index 9c91f9a24..d159358cf 100644 --- a/src/io/config.cpp +++ b/src/io/config.cpp @@ -226,10 +226,10 @@ void Config::Set(const std::unordered_map& params) { std::sort(eval_at.begin(), eval_at.end()); std::vector new_valid; - for (size_t i = 0; i < valid.size(); ++i) { - if (valid[i] != data) { + for (const auto & i : valid) { + if (i != data) { // Only push the non-training data - new_valid.push_back(valid[i]); + new_valid.push_back(i); } else { is_provide_training_metric = true; } @@ -253,6 +253,10 @@ void Config::Set(const std::unordered_map& params) { // check for conflicts CheckParamConflict(); + +#ifdef DEBUG + Log::Debug("Loading configs from Map; constraint_group_column=%s\n", this->constraint_group_column.c_str()); +#endif } bool CheckMultiClassObjective(const std::string& objective) { diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp index 7eb368a06..82e2c46b1 100644 --- a/src/io/config_auto.cpp +++ b/src/io/config_auto.cpp @@ -166,6 +166,22 @@ const std::unordered_map& Config::alias_table() { {"mlist", "machine_list_filename"}, {"workers", "machines"}, {"nodes", "machines"}, + + // FairGBM parameters + {"lagrangian_learning_rate", "multiplier_learning_rate"}, + {"lagrangian_multiplier_learning_rate", "multiplier_learning_rate"}, + {"lagrangian_multipliers", "init_lagrangian_multipliers"}, + {"init_multipliers", "init_lagrangian_multipliers"}, + {"output_dir", "debugging_output_dir"}, + {"constraint_proxy_function", "constraint_stepwise_proxy"}, + {"constraint_stepwise_proxy_function", "constraint_stepwise_proxy"}, + {"objective_proxy_function", 
"objective_stepwise_proxy"}, + {"objective_stepwise_proxy_function", "objective_stepwise_proxy"}, + {"proxy_margin", "stepwise_proxy_margin"}, + {"global_fpr", "global_target_fpr"}, + {"target_global_fpr", "global_target_fpr"}, + {"global_fnr", "global_target_fnr"}, + {"target_global_fnr", "global_target_fnr"}, }); return aliases; } @@ -302,6 +318,23 @@ const std::unordered_set& Config::parameter_set() { "gpu_device_id", "gpu_use_dp", "num_gpu", + + // FairGBM parameters + "debugging_output_dir", + "constraint_type", + "constraint_stepwise_proxy", + "objective_stepwise_proxy", + "stepwise_proxy_margin", + "constraint_group_column", + "constraint_fpr_threshold", + "constraint_fnr_threshold", + "score_threshold", + "init_lagrangian_multipliers", + "multiplier_learning_rate", + "global_constraint_type", + "global_target_fpr", + "global_target_fnr", + "global_score_threshold" }); return params; } @@ -627,6 +660,49 @@ void Config::GetMembersFromString(const std::unordered_map(tmp_str, ','); + for (auto lag : init_lagrangian_multipliers) + CHECK_GE(lag, 0); + } + + // Parameters for global constraints + Config::GetString(params, "global_constraint_type", &global_constraint_type); + + Config::GetDouble(params, "global_target_fpr", &global_target_fpr); + CHECK_GE(global_target_fpr, 0); CHECK_LE(global_target_fpr, 1); + + Config::GetDouble(params, "global_target_fnr", &global_target_fnr); + CHECK_GE(global_target_fnr, 0); CHECK_LE(global_target_fnr, 1); + + Config::GetDouble(params, "global_score_threshold", &global_score_threshold); + CHECK_GE(global_score_threshold, 0); CHECK_LE(global_score_threshold, 1); } std::string Config::SaveMembersToString() const { @@ -735,6 +811,27 @@ std::string Config::SaveMembersToString() const { str_buf << "[gpu_device_id: " << gpu_device_id << "]\n"; str_buf << "[gpu_use_dp: " << gpu_use_dp << "]\n"; str_buf << "[num_gpu: " << num_gpu << "]\n"; + + str_buf << "[------- FAIRGBM ------]\n"; + str_buf << "[debugging_output_dir: " << debugging_output_dir << "]\n"; + str_buf << "[constraint_type: " << constraint_type << "]\n"; + str_buf << "[stepwise_proxy_margin: " << stepwise_proxy_margin << "]\n"; + str_buf << "[constraint_group_column: " << constraint_group_column << "]\n"; + str_buf << "[score_threshold: " << score_threshold << "]\n"; + str_buf << "[constraint_fpr_threshold: " << constraint_fpr_threshold << "]\n"; + str_buf << "[constraint_fnr_threshold: " << constraint_fnr_threshold << "]\n"; + str_buf << "[multiplier_learning_rate: " << multiplier_learning_rate << "]\n"; + str_buf << "[init_lagrangian_multipliers: " << Common::Join(init_lagrangian_multipliers, ",") << "]\n"; + + // Global constraint parameters + str_buf << "[global_constraint_type: " << global_constraint_type << "]\n"; + str_buf << "[global_target_fpr: " << global_target_fpr << "]\n"; + str_buf << "[global_target_fnr: " << global_target_fnr << "]\n"; + str_buf << "[global_score_threshold: " << global_score_threshold << "]\n"; + + // TODO -- Add option to normalize multipliers + // str_buf << "[normalize_lagrangian_multipliers: "; + return str_buf.str(); } diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index e5cabe682..30556834d 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -33,7 +33,7 @@ Dataset::Dataset(data_size_t num_data) { CHECK_GT(num_data, 0); data_filename_ = "noname"; num_data_ = num_data; - metadata_.Init(num_data_, NO_SPECIFIC, NO_SPECIFIC); + metadata_.Init(num_data_, NO_SPECIFIC, NO_SPECIFIC, NO_SPECIFIC); is_finish_load_ = false; 
group_bin_boundaries_.push_back(0); has_raw_ = false; @@ -850,8 +850,13 @@ bool Dataset::SetFloatField(const char* field_name, const float* field_data, #else metadata_.SetWeights(field_data, num_element); #endif + } else if (name == std::string("constraint_group") || + name == std::string("fairness_group") || + name == std::string("sensitive_group") || + name == std::string("protected_group")) { + metadata_.SetConstraintGroup(field_data, num_element); } else { - return false; + return false; // Not successful } return true; } diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp index c51f8a4fd..1176b381f 100644 --- a/src/io/dataset_loader.cpp +++ b/src/io/dataset_loader.cpp @@ -22,6 +22,7 @@ DatasetLoader::DatasetLoader(const Config& io_config, const PredictFunction& pre label_idx_ = 0; weight_idx_ = NO_SPECIFIC; group_idx_ = NO_SPECIFIC; + constraint_group_idx_ = NO_SPECIFIC; SetHeader(filename); store_raw_ = false; if (io_config.linear_tree) { @@ -143,7 +144,28 @@ void DatasetLoader::SetHeader(const char* filename) { } ignore_features_.emplace(group_idx_); } + + // load constraint group column idx + if (config_.constraint_group_column.size() > 0) { + if (Common::StartsWith(config_.constraint_group_column, name_prefix)) { + std::string name = config_.constraint_group_column.substr(name_prefix.size()); + if (name2idx.count(name) > 0) { + constraint_group_idx_ = name2idx[name]; + Log::Info("Using column %s as constraint_group id", name.c_str()); + } else { + Log::Fatal("Could not find constraint_group column %s in data file", name.c_str()); + } + } else { + if (!Common::AtoiAndCheck(config_.constraint_group_column.c_str(), &constraint_group_idx_)) { + Log::Fatal("constraint_group_column is not a number,\n" + "if you want to use a column name,\n" + "please add the prefix \"name:\" to the column name"); + } + Log::Info("Using column number %d as constraint_group id", constraint_group_idx_); + } + } } + if (config_.categorical_feature.size() > 0) { if (Common::StartsWith(config_.categorical_feature, name_prefix)) { std::string names = config_.categorical_feature.substr(name_prefix.size()); @@ -217,7 +239,7 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac dataset->ResizeRaw(dataset->num_data_); } // initialize label - dataset->metadata_.Init(dataset->num_data_, weight_idx_, group_idx_); + dataset->metadata_.Init(dataset->num_data_, weight_idx_, group_idx_, constraint_group_idx_); // extract features ExtractFeaturesFromMemory(&text_data, parser.get(), dataset.get()); text_data.clear(); @@ -237,7 +259,7 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac dataset->ResizeRaw(dataset->num_data_); } // initialize label - dataset->metadata_.Init(dataset->num_data_, weight_idx_, group_idx_); + dataset->metadata_.Init(dataset->num_data_, weight_idx_, group_idx_, constraint_group_idx_); Log::Info("Making second pass..."); // extract features ExtractFeaturesFromFile(filename, parser.get(), used_data_indices, dataset.get()); @@ -279,7 +301,7 @@ Dataset* DatasetLoader::LoadFromFileAlignWithOtherDataset(const char* filename, auto text_data = LoadTextDataToMemory(filename, dataset->metadata_, 0, 1, &num_global_data, &used_data_indices); dataset->num_data_ = static_cast(text_data.size()); // initialize label - dataset->metadata_.Init(dataset->num_data_, weight_idx_, group_idx_); + dataset->metadata_.Init(dataset->num_data_, weight_idx_, group_idx_, constraint_group_idx_); dataset->CreateValid(train_data); if (dataset->has_raw()) { 
dataset->ResizeRaw(dataset->num_data_); @@ -293,7 +315,7 @@ Dataset* DatasetLoader::LoadFromFileAlignWithOtherDataset(const char* filename, dataset->num_data_ = static_cast(text_reader.CountLine()); num_global_data = dataset->num_data_; // initialize label - dataset->metadata_.Init(dataset->num_data_, weight_idx_, group_idx_); + dataset->metadata_.Init(dataset->num_data_, weight_idx_, group_idx_, constraint_group_idx_); dataset->CreateValid(train_data); if (dataset->has_raw()) { dataset->ResizeRaw(dataset->num_data_); @@ -996,6 +1018,7 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines, CHECK(label_idx_ >= 0 && label_idx_ <= dataset->num_total_features_); CHECK(weight_idx_ < 0 || weight_idx_ < dataset->num_total_features_); CHECK(group_idx_ < 0 || group_idx_ < dataset->num_total_features_); + CHECK(constraint_group_idx_ == NO_SPECIFIC || (constraint_group_idx_ >= 0 && constraint_group_idx_ < dataset->num_total_features_)); // FairGBM // fill feature_names_ if not header if (feature_names_.empty()) { @@ -1178,6 +1201,8 @@ void DatasetLoader::ExtractFeaturesFromMemory(std::vector* text_dat dataset->metadata_.SetQueryAt(i, static_cast(inner_data.second)); } } + if (inner_data.first == constraint_group_idx_) + dataset->metadata_.SetConstraintGroupAt(i, static_cast(inner_data.second)); } if (dataset->has_raw()) { for (size_t j = 0; j < feature_row.size(); ++j) { @@ -1235,6 +1260,8 @@ void DatasetLoader::ExtractFeaturesFromMemory(std::vector* text_dat dataset->metadata_.SetQueryAt(i, static_cast(inner_data.second)); } } + if (inner_data.first == constraint_group_idx_) + dataset->metadata_.SetConstraintGroupAt(i, static_cast(inner_data.second)); } dataset->FinishOneRow(tid, i, is_feature_added); if (dataset->has_raw()) { @@ -1308,6 +1335,8 @@ void DatasetLoader::ExtractFeaturesFromFile(const char* filename, const Parser* dataset->metadata_.SetQueryAt(start_idx + i, static_cast(inner_data.second)); } } + if (inner_data.first == constraint_group_idx_) + dataset->metadata_.SetConstraintGroupAt(start_idx + i, static_cast(inner_data.second)); } if (dataset->has_raw()) { for (size_t j = 0; j < feature_row.size(); ++j) { diff --git a/src/io/metadata.cpp b/src/io/metadata.cpp index 49fc834b8..d97ecca46 100644 --- a/src/io/metadata.cpp +++ b/src/io/metadata.cpp @@ -32,7 +32,8 @@ void Metadata::Init(const char* data_filename) { Metadata::~Metadata() { } -void Metadata::Init(data_size_t num_data, int weight_idx, int query_idx) { +void Metadata::Init(data_size_t num_data, int weight_idx, int query_idx, int constraint_group_idx_) { + std::lock_guard lock(mutex_); num_data_ = num_data; label_ = std::vector(num_data_); if (weight_idx >= 0) { @@ -53,6 +54,8 @@ void Metadata::Init(data_size_t num_data, int weight_idx, int query_idx) { queries_ = std::vector(num_data_, 0); query_load_from_file_ = false; } + if (constraint_group_idx_ >= 0) + constraint_group_ = std::vector(num_data_, 0); } void Metadata::Init(const Metadata& fullset, const data_size_t* used_indices, data_size_t num_used_indices) { @@ -320,6 +323,24 @@ void Metadata::SetLabel(const label_t* label, data_size_t len) { } } +void Metadata::SetConstraintGroup(const float* constraint_group, data_size_t len) { + std::lock_guard lock(mutex_); + if (constraint_group == nullptr) { + Log::Fatal("constraint_group cannot be nullptr"); + } + if (num_data_ != len) { + Log::Fatal("Length of constraint_group is not same as #data"); + } + if (constraint_group_.empty()) { + constraint_group_.resize(num_data_); + } + + #pragma omp 
parallel for schedule(static, 512) if (num_data_ >= 1024) + for (data_size_t i = 0; i < num_data_; ++i) { + constraint_group_[i] = static_cast(Common::AvoidInf(constraint_group[i])); + } +} + void Metadata::SetWeights(const label_t* weights, data_size_t len) { std::lock_guard lock(mutex_); // save to nullptr @@ -503,6 +524,8 @@ void Metadata::LoadFromMemory(const void* memory) { query_load_from_file_ = true; } LoadQueryWeights(); + + // TODO! load constraint_group_ information from memory ?? } void Metadata::SaveBinaryToFile(const VirtualFileWriter* writer) const { @@ -538,5 +561,4 @@ size_t Metadata::SizesInByte() const { return size; } - } // namespace LightGBM diff --git a/src/objective/constrained_recall_objective.hpp b/src/objective/constrained_recall_objective.hpp new file mode 100644 index 000000000..b15168f41 --- /dev/null +++ b/src/objective/constrained_recall_objective.hpp @@ -0,0 +1,212 @@ +/** + * The copyright of this file belongs to Feedzai. The file cannot be + * reproduced in whole or in part, stored in a retrieval system, + * transmitted in any form, or by any means electronic, mechanical, + * photocopying, or otherwise, without the prior permission of the owner. + * + * (c) 2021 Feedzai, Strictly Confidential + */ +/*! + * Constrained proxy recall objective (minimize proxy FNR). + */ + +#pragma clang diagnostic push +#pragma ide diagnostic ignored "openmp-use-default-none" + +#ifndef LIGHTGBM_CONSTRAINED_RECALL_OBJECTIVE_HPP +#define LIGHTGBM_CONSTRAINED_RECALL_OBJECTIVE_HPP + +#include +#include +#include +#include +#include "../metric/xentropy_metric.hpp" + +#include +#include + +/** + * Implements the proxy FNR loss (Recall as an objective). + * + * Minimizing FNR is equivalent to maximizing TPR (or Recall), as TPR = 1-FNR. + * Could use cross-entropy, quadratic, or hinge as proxy functions for FNR's step-wise function. + * > We need to use a differentiable proxy, as the step-wise function provides no gradient for optimization. + */ + +namespace LightGBM { + +class ConstrainedRecallObjective : public ConstrainedObjectiveFunction { +public: + explicit ConstrainedRecallObjective(const Config &config) + : deterministic_(config.deterministic) { + SetUpFromConfig(config); + + if (not this->IsGlobalFPRConstrained()) + throw std::invalid_argument("Must provide a global FPR constraint in order to optimize for Recall!"); + + if (objective_stepwise_proxy == "cross_entropy" or constraint_stepwise_proxy == "cross_entropy") { + if (proxy_margin_ < DBL_MIN) { + Log::Fatal("Proxy margin must be positive. It was %f.", proxy_margin_); + } + } + + if (objective_stepwise_proxy.empty()) { + Log::Fatal("Must provide an `objective_stepwise_proxy` to optimize for Recall. Got empty input."); + } + + // Disclaimer on using ConstrainedRecallObjective + Log::Warning("Directly optimizing for Recall is still being researched and is prone to high variability of outcomes."); + }; + + explicit ConstrainedRecallObjective(const std::vector &) + : deterministic_(false) { + throw std::invalid_argument( + "I don't think this constructor should ever be called; " + "it's only here for consistency with other objective functions."); + } + + ~ConstrainedRecallObjective() override = default; + + const char *GetName() const override { + return "constrained_recall_objective"; + } + + std::string ToString() const override { + return this->GetName(); + } + + /** + * Compute proxy FNR loss. 
+ * + * Loss function: + * - Quadratic: l(a) = (1/2) * (a - margin_)^2 * I[a < margin_], where l(margin_) = 0 + * - BCE: l(a) = log( 1 + exp( -a + log(exp(margin_) - 1) ) ), where l(0) = margin_ + * - Hinge: l(a) = (margin_ - a) * I[a < margin_], where l(margin_) = 0 + * + * @param label The instance label. + * @param score The instance predicted score. + * @return The loss value. + */ + double ComputePredictiveLoss(label_t label, double score) const override { + // If label is zero, loss will be zero + if (abs(label) < 1e-5) // if (y_i == 0) + return 0.; + + if (objective_stepwise_proxy == "quadratic") + return score < proxy_margin_ ? (1./2.) * pow(score - proxy_margin_, 2) : 0.; // proxy_margin_ is the HORIZONTAL margin! + + else if (objective_stepwise_proxy == "cross_entropy") { + double xent_horizontal_shift = log(exp(proxy_margin_) - 1); // proxy_margin_ is the VERTICAL margin! + return log(1 + exp(-score + xent_horizontal_shift)); + } + + else if (objective_stepwise_proxy == "hinge") + return score < proxy_margin_ ? proxy_margin_ - score : 0.; // proxy_margin_ is the HORIZONTAL margin! + + else + throw std::invalid_argument("Invalid objective_stepwise_proxy=" + objective_stepwise_proxy); + } + + /*! + * The optimal constant-value model starts at logodds==0, as opposed to starting from the average score. + * This is due using a different objective function, plus using global constraints. + * @return 0 + */ + double BoostFromScore(int) const override { + Log::Info("constrained_recall_objective: boosting from scores == 0;"); + return 0.; + } + + /** + * > aka GetPredictiveLossGradientsWRTModelOutput + * + * Gradients of the proxy FNR loss w.r.t. the model output (scores). + * + * l(a) = (1/2) * (a - margin_)^2 * I[a < margin_] + * + * dl/da = (a - margin_) * I[a < margin_] + * + * @param score + * @param gradients + * @param hessians + */ + void GetGradients(const double *score, score_t *gradients, score_t *hessians) const override { + /** + * How much to shift the cross-entropy function (horizontally) to get + * the target proxy_margin_ at x=0; i.e., f(0) = proxy_margin_ + */ + const double xent_horizontal_shift = log(exp(proxy_margin_) - 1); + + /** + * NOTE + * - https://github.com/feedzai/fairgbm/issues/11 + * - This value should be zero in order to optimize solely for TPR (Recall), + * as TPR considers only label positives (LPs) and ignores label negatives (LNs). + * - However, initial splits will have -inf information gain if the gradients + * of all LNs are 0; + * - Hence, we're adding a tiny positive weight to the gradient of all LNs; + */ + const double label_negative_weight = 1e-2; + + #pragma omp parallel for schedule(static) + for (data_size_t i = 0; i < num_data_; ++i) { + + // Proxy FNR (or proxy Recall) has no loss for label negative samples (they're ignored). + if (abs(label_[i] - 1) < 1e-5) { // if (y_i == 1) + if (objective_stepwise_proxy == "quadratic") { + gradients[i] = (score_t) (score[i] < proxy_margin_ ? score[i] - proxy_margin_ : 0.); + hessians[i] = (score_t) (score[i] < proxy_margin_ ? 1. : 0.); + } + + else if (objective_stepwise_proxy == "cross_entropy") { + const double z = Constrained::sigmoid(score[i] - xent_horizontal_shift); + gradients[i] = (score_t) (z - 1.); + hessians[i] = (score_t) (z * (1. - z)); + } + + else if (objective_stepwise_proxy == "hinge") { + gradients[i] = (score_t) (score[i] < proxy_margin_ ? -1. 
: 0.); + hessians[i] = (score_t) 0.; + } + + else { + throw std::invalid_argument("Invalid objective proxy: " + objective_stepwise_proxy); + } + + if (weights_ != nullptr) { + gradients[i] *= weights_[i]; + hessians[i] *= weights_[i]; + } + + } else { + // NOTE: https://github.com/feedzai/fairgbm/issues/11 + // - This whole else clause should not be needed to optimize for Recall, + // as LNs have no influence on the FNR loss function or its (proxy-)gradient; + // - However, passing a zero gradient to all LNs leads to weird early stopping + // behavior from the `GBDT::Train` function; + // - Adding this tiny weight to the gradient of LNs seems to fix the issue with + // no (apparent) unintended consequences, as the gradient flowing is really small; + const double z = Constrained::sigmoid(score[i] + xent_horizontal_shift); + gradients[i] = (score_t) (label_negative_weight * z); + hessians[i] = (score_t) (label_negative_weight * z * (1. - z)); + } + } + } + + void GetConstraintGradientsWRTModelOutput(const double *multipliers, const double *score, score_t *gradients, + score_t *hessians) const override { + if (not this->IsGlobalFPRConstrained()) + throw std::invalid_argument("Recall objective function must have a global FPR constraint!"); + + ConstrainedObjectiveFunction::GetConstraintGradientsWRTModelOutput(multipliers, score, gradients, hessians); + } + +private: + const bool deterministic_; + +}; + + +} + +#endif //LIGHTGBM_CONSTRAINED_RECALL_OBJECTIVE_HPP diff --git a/src/objective/constrained_xentropy_objective.hpp b/src/objective/constrained_xentropy_objective.hpp new file mode 100644 index 000000000..499763c06 --- /dev/null +++ b/src/objective/constrained_xentropy_objective.hpp @@ -0,0 +1,157 @@ +/** + * The copyright of this file belongs to Feedzai. The file cannot be + * reproduced in whole or in part, stored in a retrieval system, + * transmitted in any form, or by any means electronic, mechanical, + * photocopying, or otherwise, without the prior permission of the owner. + * + * (c) 2021 Feedzai, Strictly Confidential + */ +/*! + * Copyright (c) 2017 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. + */ + +#pragma clang diagnostic push +#pragma ide diagnostic ignored "openmp-use-default-none" + +#ifndef LIGHTGBM_OBJECTIVE_CONSTRAINED_XENTROPY_OBJECTIVE_HPP_ +#define LIGHTGBM_OBJECTIVE_CONSTRAINED_XENTROPY_OBJECTIVE_HPP_ + +#include +#include +#include +#include +#include "../metric/xentropy_metric.hpp" + +#include +#include +#include +#include +#include + +namespace LightGBM { + +/** + * Objective function for constrained optimization. + * Uses the well-known Binary Cross Entropy (BCE) function for measuring predictive loss, plus + * Uses a cross-entropy-based function as a proxy for the step-wise function when computing fairness constraints. 
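 *
 * (For intuition on that proxy: for a label-positive instance the cross-entropy proxy FNR loss is
 *  l(s) = log(1 + exp(-s + log(exp(b) - 1))) with b = proxy_margin_, so l(0) = log(exp(b)) = b;
 *  that is, proxy_margin_ is the value of the proxy loss at a raw score of zero.)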
+ * + * NOTE: + * - This `constrained_xentropy` objective generally leads to the best constrained results; + * - All results from the FairGBM paper use this objective function with the "cross_entropy" step-wise proxy; + * - This pairing of "constrained cross-entropy objective + cross-entropy proxy for constraints" was tested the most; + */ +class ConstrainedCrossEntropy : public ConstrainedObjectiveFunction { // TODO: inherit from both CrossEntropy and ConstrainedObjectiveFunction +public: + explicit ConstrainedCrossEntropy(const Config &config) + : deterministic_(config.deterministic) { + SetUpFromConfig(config); + + if (not objective_stepwise_proxy.empty()) { + Log::Warning("Ignoring argument objective_stepwise_proxy=%s.", objective_stepwise_proxy.c_str()); + } + } + + explicit ConstrainedCrossEntropy(const std::vector &) + : deterministic_(false) { + Log::Warning( + "The objective function 'constrained_cross_entropy' was not properly loaded. " + "Resuming training is not available; everything else can be used as usual." + ); // TODO: https://github.com/feedzai/fairgbm/issues/10 + } + + ~ConstrainedCrossEntropy() override = default; + + double ComputePredictiveLoss(label_t label, double score) const override { + return XentLoss(label, Constrained::sigmoid(score)); + } + + /** + * > aka GetPredictiveLossGradientsWRTModelOutput + * + * Gradient of the predictive loss w.r.t. model output (scores). + * This is a duplicate of the implementation in the CrossEntropy class. + * + * @param score Model outputs. + * @param gradients Reference to gradients' vector. + * @param hessians Reference to hessians' vector. + */ + void GetGradients(const double *score, score_t *gradients, score_t *hessians) const override { + if (weights_ == nullptr) { + // compute pointwise gradients and Hessians with implied unit weights + #pragma omp parallel for schedule(static) + for (data_size_t i = 0; i < num_data_; ++i) { + const double z = Constrained::sigmoid(score[i]); + + gradients[i] = static_cast(z - label_[i]); // 1st derivative + hessians[i] = static_cast(z * (1.0f - z)); // 2nd derivative + // NOTE: should we set the 2nd derivative to zero? to stick to a 1st order method in both descent and ascent steps. 
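        // Quick numeric check of the expressions above: at a raw score of 0, z = sigmoid(0) = 0.5,
        // so a positive instance (label 1) gets gradient 0.5 - 1 = -0.5 and hessian
        // 0.5 * (1 - 0.5) = 0.25, i.e. the boosting update pushes its score (and probability) up.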
+ } + } else { + // compute pointwise gradients and Hessians with given weights + #pragma omp parallel for schedule(static) + for (data_size_t i = 0; i < num_data_; ++i) { + const double z = Constrained::sigmoid(score[i]); + + gradients[i] = static_cast((z - label_[i]) * weights_[i]); + hessians[i] = static_cast(z * (1.0f - z) * weights_[i]); + } + } + } + + const char *GetName() const override { + return "constrained_cross_entropy"; + } + + std::string ToString() const override { + std::stringstream str_buf; + str_buf << GetName(); + /* str_buf << "_->constraint_type->" << constraint_type; + str_buf << "_->groups("; + for (auto &group: group_values_) + str_buf << group << ","; + str_buf << ")"; + + str_buf << "_score_threshold->" << score_threshold_; + str_buf << "_fpr_threshold->" << fpr_threshold_; + str_buf << "_fnr_threshold->" << fnr_threshold_; */ + return str_buf.str(); + } + + // implement custom average to boost from (if enabled among options) + double BoostFromScore(int) const override { + double suml = 0.0f; + double sumw = 0.0f; + if (weights_ != nullptr) { + + #pragma omp parallel for schedule(static) reduction(+:suml, sumw) if (!deterministic_) + for (data_size_t i = 0; i < num_data_; ++i) { + suml += label_[i] * weights_[i]; + sumw += weights_[i]; + } + } else { + sumw = static_cast(num_data_); + + #pragma omp parallel for schedule(static) reduction(+:suml) if (!deterministic_) + for (data_size_t i = 0; i < num_data_; ++i) { + suml += label_[i]; + } + } + double pavg = sumw > 0.0f ? suml / sumw : 0.0f; + pavg = std::min(pavg, 1.0 - kEpsilon); + pavg = std::max(pavg, kEpsilon); + double initscore = std::log(pavg / (1.0f - pavg)); + Log::Info("[%s:%s]: pavg = %f -> initscore = %f", GetName(), __func__, pavg, initscore); + return initscore; + } + +private: + const bool deterministic_; + +}; + +} // end namespace LightGBM + +#endif // end #ifndef LIGHTGBM_OBJECTIVE_CONSTRAINED_XENTROPY_OBJECTIVE_HPP_ + +#pragma clang diagnostic pop \ No newline at end of file diff --git a/src/objective/objective_function.cpp b/src/objective/objective_function.cpp index 193353d93..dcfbd4901 100644 --- a/src/objective/objective_function.cpp +++ b/src/objective/objective_function.cpp @@ -9,6 +9,9 @@ #include "rank_objective.hpp" #include "regression_objective.hpp" #include "xentropy_objective.hpp" +#include "constrained_xentropy_objective.hpp" +#include "constrained_recall_objective.hpp" + namespace LightGBM { @@ -37,6 +40,10 @@ ObjectiveFunction* ObjectiveFunction::CreateObjectiveFunction(const std::string& return new MulticlassOVA(config); } else if (type == std::string("cross_entropy")) { return new CrossEntropy(config); + } else if (type == std::string("constrained_cross_entropy")) { // Entry-point for FairGBM code! + return new ConstrainedCrossEntropy(config); + } else if (type == std::string("constrained_recall_objective")) { // Entry-point for FairGBM code! 
+ return new ConstrainedRecallObjective(config); } else if (type == std::string("cross_entropy_lambda")) { return new CrossEntropyLambda(config); } else if (type == std::string("mape")) { @@ -79,6 +86,10 @@ ObjectiveFunction* ObjectiveFunction::CreateObjectiveFunction(const std::string& return new MulticlassOVA(strs); } else if (type == std::string("cross_entropy")) { return new CrossEntropy(strs); + } else if (type == std::string("constrained_cross_entropy")) { + return new ConstrainedCrossEntropy(strs); + } else if (type == std::string("constrained_recall_objective")) { + return new ConstrainedRecallObjective(strs); } else if (type == std::string("cross_entropy_lambda")) { return new CrossEntropyLambda(strs); } else if (type == std::string("mape")) {