From 26df0d5a5b95085a6178fca45f9ce988d5632968 Mon Sep 17 00:00:00 2001 From: Nicholas Smith Date: Tue, 16 Feb 2021 17:30:15 -0600 Subject: [PATCH] Add overflow behavior handling in binning nodes (#37) * Add overflow behavior handling in binning nodes Closes #5 * Add binning tests And fix a bug! std::lower_bound gives edges[i] < x <= edges[i+1] std::upper_bound gives edges[i] <= x < edges[i+1] which is what we specified * Update rendered schemas Need to add this to pre-commit or something --- data/conversion.py | 5 ++ data/schemav2.json | 85 +++++++++++++++++++++- include/correction.h | 7 ++ src/correction.cc | 67 +++++++++++++++--- src/correctionlib/convert.py | 2 + src/correctionlib/schemav2.py | 4 ++ tests/test_core.py | 128 ++++++++++++++++++++++++++++++++-- 7 files changed, 281 insertions(+), 17 deletions(-) diff --git a/data/conversion.py b/data/conversion.py index d4ffd9f3..f7eb810c 100755 --- a/data/conversion.py +++ b/data/conversion.py @@ -51,6 +51,7 @@ def build_discrbinning(sf): build_formula(sf[(sf["discrMin"] >= lo) & (sf["discrMax"] <= hi)]) for lo, hi in zip(edges[:-1], edges[1:]) ], + "flow": "clamp", } ) @@ -66,6 +67,7 @@ def build_ptbinning(sf): build_discrbinning(sf[(sf["ptMin"] >= lo) & (sf["ptMax"] <= hi)]) for lo, hi in zip(edges[:-1], edges[1:]) ], + "flow": "clamp", } ) @@ -81,6 +83,7 @@ def build_etabinning(sf): build_ptbinning(sf[(sf["etaMin"] >= lo) & (sf["etaMax"] <= hi)]) for lo, hi in zip(edges[:-1], edges[1:]) ], + "flow": "error", } ) @@ -183,6 +186,7 @@ def build_pts(sf): "input": "pt", "edges": edges, "content": content, + "flow": "clamp", } ) @@ -207,6 +211,7 @@ def build_etas(sf): "input": "eta", "edges": edges, "content": content, + "flow": "error", } ) diff --git a/data/schemav2.json b/data/schemav2.json index ac6eaa6f..e0e700ea 100644 --- a/data/schemav2.json +++ b/data/schemav2.json @@ -149,6 +149,10 @@ "const": "category", "type": "string" }, + "input": { + "title": "Input", + "type": "string" + }, "content": { "title": "Content", "type": "array", @@ -159,6 +163,7 @@ }, "required": [ "nodetype", + "input", "content" ], "additionalProperties": false @@ -173,6 +178,13 @@ "const": "multibinning", "type": "string" }, + "inputs": { + "title": "Inputs", + "type": "array", + "items": { + "type": "string" + } + }, "edges": { "title": "Edges", "type": "array", @@ -205,12 +217,47 @@ } ] } + }, + "flow": { + "title": "Flow", + "anyOf": [ + { + "$ref": "#/definitions/Binning" + }, + { + "$ref": "#/definitions/MultiBinning" + }, + { + "$ref": "#/definitions/Category" + }, + { + "$ref": "#/definitions/Formula" + }, + { + "type": "number" + }, + { + "title": "Flow Literal['Clamp', 'Error']", + "anyOf": [ + { + "const": "clamp", + "type": "string" + }, + { + "const": "error", + "type": "string" + } + ] + } + ] } }, "required": [ "nodetype", + "inputs", "edges", - "content" + "content", + "flow" ], "additionalProperties": false }, @@ -256,13 +303,47 @@ } ] } + }, + "flow": { + "title": "Flow", + "anyOf": [ + { + "$ref": "#/definitions/Binning" + }, + { + "$ref": "#/definitions/MultiBinning" + }, + { + "$ref": "#/definitions/Category" + }, + { + "$ref": "#/definitions/Formula" + }, + { + "type": "number" + }, + { + "title": "Flow Literal['Clamp', 'Error']", + "anyOf": [ + { + "const": "clamp", + "type": "string" + }, + { + "const": "error", + "type": "string" + } + ] + } + ] } }, "required": [ "nodetype", "input", "edges", - "content" + "content", + "flow" ], "additionalProperties": false }, diff --git a/include/correction.h b/include/correction.h index 4c497f1b..e31a30cb 100644 --- a/include/correction.h +++ b/include/correction.h @@ -85,6 +85,9 @@ class Formula { double eval_ast(const Ast& ast, const std::vector& variables) const; }; +// common internal for Binning and MultiBinning +enum class _FlowBehavior {value, clamp, error}; + class Binning { public: Binning(const rapidjson::Value& json, const std::vector& inputs); @@ -93,6 +96,8 @@ class Binning { private: std::vector> bins_; size_t variableIdx_; + _FlowBehavior flow_; + std::unique_ptr default_value_; }; class MultiBinning { @@ -105,6 +110,8 @@ class MultiBinning { // variableIdx, stride, edges std::vector>> axes_; std::vector content_; + _FlowBehavior flow_; + std::unique_ptr default_value_; }; class Category { diff --git a/src/correction.cc b/src/correction.cc index 8cc5156c..c2ff9a91 100644 --- a/src/correction.cc +++ b/src/correction.cc @@ -325,23 +325,48 @@ Binning::Binning(const rapidjson::Value& json, const std::vector& inpu throw std::runtime_error("Inconsistency in Binning: number of content nodes does not match binning"); } bins_.reserve(edges.size()); - // first bin is a dummy content node (represets lower_bound returning underflow) - // TODO: good spot to put overflow default behavior + // first bin is a dummy content node (represents upper_bound returning underflow) bins_.push_back({*edges.begin(), 0.}); for (size_t i=0; i < content.Size(); ++i) { bins_.push_back({edges[i + 1], resolve_content(content[i], inputs)}); } variableIdx_ = find_variable_index(json["input"], inputs); + if ( json["flow"] == "clamp" ) { + flow_ = _FlowBehavior::clamp; + } + else if ( json["flow"] == "error" ) { + flow_ = _FlowBehavior::error; + } + else { // Content node + flow_ = _FlowBehavior::value; + default_value_ = std::make_unique(resolve_content(json["flow"], inputs)); + } } const Content& Binning::child(const std::vector& values) const { double value = std::get(values[variableIdx_]); - auto it = std::lower_bound(std::begin(bins_), std::end(bins_), value, [](const auto& a, auto b) { return std::get<0>(a) < b; }); + auto it = std::upper_bound(std::begin(bins_), std::end(bins_), value, [](const double& a, const auto& b) { return a < std::get<0>(b); }); if ( it == std::begin(bins_) ) { - throw std::runtime_error("Index below bounds in Binning for input " + std::to_string(variableIdx_) + " value: " + std::to_string(value)); + if ( flow_ == _FlowBehavior::value ) { + return *default_value_; + } + else if ( flow_ == _FlowBehavior::error ) { + throw std::runtime_error("Index below bounds in Binning for input " + std::to_string(variableIdx_) + " value: " + std::to_string(value)); + } + else { // clamp + it++; + } } else if ( it == std::end(bins_) ) { - throw std::runtime_error("Index above bounds in Binning for input " + std::to_string(variableIdx_) + " value: " + std::to_string(value)); + if ( flow_ == _FlowBehavior::value ) { + return *default_value_; + } + else if ( flow_ == _FlowBehavior::error ) { + throw std::runtime_error("Index above bounds in Binning for input " + std::to_string(variableIdx_) + " value: " + std::to_string(value)); + } + else { // clamp + it--; + } } return std::get<1>(*it); } @@ -373,18 +398,44 @@ MultiBinning::MultiBinning(const rapidjson::Value& json, const std::vector(resolve_content(json["flow"], inputs)); + } } const Content& MultiBinning::child(const std::vector& values) const { size_t idx {0}; for (const auto& [variableIdx, stride, edges] : axes_) { double value = std::get(values[variableIdx]); - auto it = std::lower_bound(std::begin(edges), std::end(edges), value); + auto it = std::upper_bound(std::begin(edges), std::end(edges), value); if ( it == std::begin(edges) ) { - throw std::runtime_error("Index below bounds in MultiBinning for input " + std::to_string(variableIdx) + " val: " + std::to_string(value)); + if ( flow_ == _FlowBehavior::value ) { + return *default_value_; + } + else if ( flow_ == _FlowBehavior::error ) { + throw std::runtime_error("Index below bounds in MultiBinning for input " + std::to_string(variableIdx) + " val: " + std::to_string(value)); + } + else { // clamp + it++; + } } else if ( it == std::end(edges) ) { - throw std::runtime_error("Index above bounds in MultiBinning input " + std::to_string(variableIdx) + " val: " + std::to_string(value)); + if ( flow_ == _FlowBehavior::value ) { + return *default_value_; + } + else if ( flow_ == _FlowBehavior::error ) { + throw std::runtime_error("Index above bounds in MultiBinning input " + std::to_string(variableIdx) + " val: " + std::to_string(value)); + } + else { // clamp + it--; + } } size_t localidx = std::distance(std::begin(edges), it) - 1; idx += localidx * stride; diff --git a/src/correctionlib/convert.py b/src/correctionlib/convert.py index 13b95a38..5ffb6558 100644 --- a/src/correctionlib/convert.py +++ b/src/correctionlib/convert.py @@ -90,6 +90,7 @@ def build_data( else build_data(value, axes[i:], variables[i:]) for value in flatten_to(values, i - 1) ], + "flow": "error", # TODO: can also produce overflow guard bins and clamp } ) return Binning.parse_obj( @@ -103,6 +104,7 @@ def build_data( else build_data(value, axes[1:], variables[1:]) for value in values ], + "flow": "error", # TODO: can also produce overflow guard bins and clamp } ) diff --git a/src/correctionlib/schemav2.py b/src/correctionlib/schemav2.py index 429eb7b8..dda852f1 100644 --- a/src/correctionlib/schemav2.py +++ b/src/correctionlib/schemav2.py @@ -42,6 +42,8 @@ class Binning(Model): edges: List[float] "Edges of the binning, where edges[i] <= x < edges[i+1] => f(x, ...) = content[i](...)" content: List[Content] + flow: Union[Content, Literal["clamp", "error"]] + "Overflow behavior for out-of-bounds values" class MultiBinning(Model): @@ -56,6 +58,8 @@ class MultiBinning(Model): to the element at i0 in dimension 0, i1 in dimension 1, etc. and d0 = len(edges[0]), etc. """ content: List[Content] + flow: Union[Content, Literal["clamp", "error"]] + "Overflow behavior for out-of-bounds values" class CategoryItem(Model): diff --git a/tests/test_core.py b/tests/test_core.py index 3d5dfbb4..286699ec 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -7,6 +7,14 @@ from correctionlib import schemav2 as schema +def wrap(*corrs): + cset = schema.CorrectionSet( + schema_version=2, + corrections=list(corrs), + ) + return core.CorrectionSet.from_string(cset.json()) + + def test_evaluator_v1(): with pytest.raises(RuntimeError): cset = core.CorrectionSet.from_string("{") @@ -17,13 +25,6 @@ def test_evaluator_v1(): with pytest.raises(RuntimeError): cset = core.CorrectionSet.from_string('{"schema_version": "blah"}') - def wrap(*corrs): - cset = schema.CorrectionSet( - schema_version=2, - corrections=list(corrs), - ) - return core.CorrectionSet.from_string(cset.json()) - cset = wrap( schema.Correction( name="test corr", @@ -57,6 +58,7 @@ def wrap(*corrs): "nodetype": "binning", "input": "pt", "edges": [0, 20, 40], + "flow": "error", "content": [ schema.Category.parse_obj( { @@ -161,3 +163,115 @@ def test_tformula(): for i, (_, expected) in enumerate(formulas): for x in test_values: assert corr.evaluate(i, x) == expected(x) + + +def test_binning(): + def binning(flow): + cset = wrap( + schema.Correction( + name="test", + version=2, + inputs=[schema.Variable(name="x", type="real")], + output=schema.Variable(name="a scale", type="real"), + data=schema.Binning( + nodetype="binning", + input="x", + edges=[0.0, 1.0, 3.0], + content=[1.0, 2.0], + flow=flow, + ), + ) + ) + return cset["test"] + + corr = binning(flow="error") + with pytest.raises(RuntimeError): + corr.evaluate(-1.0) + assert corr.evaluate(0.0) == 1.0 + assert corr.evaluate(0.2) == 1.0 + assert corr.evaluate(1.0) == 2.0 + with pytest.raises(RuntimeError): + corr.evaluate(3.0) + + corr = binning(flow="clamp") + assert corr.evaluate(-1.0) == 1.0 + assert corr.evaluate(1.0) == 2.0 + assert corr.evaluate(3.0) == 2.0 + assert corr.evaluate(3000.0) == 2.0 + + corr = binning(flow=42.0) + assert corr.evaluate(-1.0) == 42.0 + assert corr.evaluate(0.0) == 1.0 + assert corr.evaluate(1.0) == 2.0 + assert corr.evaluate(2.9) == 2.0 + assert corr.evaluate(3.0) == 42.0 + + def multibinning(flow): + cset = wrap( + schema.Correction( + name="test", + version=2, + inputs=[ + schema.Variable(name="x", type="real"), + schema.Variable(name="y", type="real"), + ], + output=schema.Variable(name="a scale", type="real"), + data=schema.MultiBinning( + nodetype="multibinning", + inputs=["x", "y"], + edges=[ + [0.0, 1.0, 3.0], + [10.0, 20.0, 30.0, 40.0], + ], + content=[float(i) for i in range(2 * 3)], + flow=flow, + ), + ) + ) + return cset["test"] + + corr = multibinning(flow="error") + with pytest.raises(RuntimeError): + corr.evaluate(0.0, 5.0) + with pytest.raises(RuntimeError): + corr.evaluate(-1.0, 5.0) + assert corr.evaluate(0.0, 10.0) == 0.0 + assert corr.evaluate(0.0, 20.0) == 1.0 + assert corr.evaluate(0.0, 30.0) == 2.0 + with pytest.raises(RuntimeError): + corr.evaluate(0.0, 40.0) + assert corr.evaluate(1.0, 10.0) == 3.0 + assert corr.evaluate(1.0, 20.0) == 4.0 + assert corr.evaluate(1.0, 30.0) == 5.0 + with pytest.raises(RuntimeError): + corr.evaluate(2.0, 5.0) + + corr = multibinning(flow="clamp") + assert corr.evaluate(-1.0, 5.0) == 0.0 + assert corr.evaluate(-1.0, 25.0) == 1.0 + assert corr.evaluate(-1.0, 35.0) == 2.0 + assert corr.evaluate(-1.0, 45.0) == 2.0 + assert corr.evaluate(0.0, 45.0) == 2.0 + assert corr.evaluate(2.0, 45.0) == 5.0 + assert corr.evaluate(3.0, 45.0) == 5.0 + assert corr.evaluate(3.0, 35.0) == 5.0 + assert corr.evaluate(3.0, 25.0) == 4.0 + assert corr.evaluate(3.0, 15.0) == 3.0 + assert corr.evaluate(3.0, 5.0) == 3.0 + assert corr.evaluate(0.0, 5.0) == 0.0 + + corr = multibinning(flow=42.0) + assert corr.evaluate(-1.0, 5.0) == 42.0 + assert corr.evaluate(2.0, 45.0) == 42.0 + assert corr.evaluate(3.0, 5.0) == 42.0 + + corr = multibinning( + flow=schema.Formula( + nodetype="formula", + expression="2.*x + 5.*y", + parser="TFormula", + variables=["x", "y"], + ) + ) + assert corr.evaluate(-1.0, 5.0) == 2.0 * -1 + 5.0 * 5.0 + assert corr.evaluate(0.0, 10.0) == 0.0