diff --git a/.gitignore b/.gitignore
index 2f182d65..4e73d660 100644
--- a/.gitignore
+++ b/.gitignore
@@ -117,3 +117,8 @@ settings.json
 *.DS_Store
 *Icon
 *r
+
+# asv environments
+.asv
+
+coverage_html_report
diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py
new file mode 100644
index 00000000..8b137891
--- /dev/null
+++ b/benchmarks/__init__.py
@@ -0,0 +1 @@
+
diff --git a/benchmarks/asv.conf.json b/benchmarks/asv.conf.json
new file mode 100644
index 00000000..17518034
--- /dev/null
+++ b/benchmarks/asv.conf.json
@@ -0,0 +1,160 @@
+{
+    // The version of the config file format. Do not change, unless
+    // you know what you are doing.
+    "version": 1,
+
+    // The name of the project being benchmarked
+    "project": "chainladder-python",
+
+    // The project's homepage
+    "project_url": "https://chainladder-python.readthedocs.io/en/latest/",
+
+    // The URL or local path of the source code repository for the
+    // project being benchmarked
+    "repo": "..",
+
+    // The Python project's subdirectory in your repo. If missing or
+    // the empty string, the project is assumed to be located at the root
+    // of the repository.
+    // "repo_subdir": "",
+
+    // Customizable commands for building, installing, and
+    // uninstalling the project. See asv.conf.json documentation.
+    //
+    // "install_command": ["in-dir={env_dir} python -mpip install {wheel_file}"],
+    // "uninstall_command": ["return-code=any python -mpip uninstall -y {project}"],
+    // "build_command": [
+    //     "python setup.py build",
+    //     "PIP_NO_BUILD_ISOLATION=false python -mpip wheel --no-deps --no-index -w {build_cache_dir} {build_dir}"
+    // ],
+
+    // List of branches to benchmark. If not provided, defaults to "master"
+    // (for git) or "default" (for mercurial).
+    // "branches": ["master"], // for git
+    // "branches": ["default"], // for mercurial
+
+    // The DVCS being used. If not set, it will be automatically
+    // determined from "repo" by looking at the protocol in the URL
+    // (if remote), or by looking for special directories, such as
+    // ".git" (if local).
+    // "dvcs": "git",
+
+    // The tool to use to create environments. May be "conda",
+    // "virtualenv" or other value depending on the plugins in use.
+    // If missing or the empty string, the tool will be automatically
+    // determined by looking for tools on the PATH environment
+    // variable.
+    "environment_type": "conda",
+
+    // timeout in seconds for installing any dependencies in environment
+    // defaults to 10 min
+    //"install_timeout": 600,
+
+    // the base URL to show a commit for the project.
+    // "show_commit_url": "http://github.com/owner/project/commit/",
+
+    // The Pythons you'd like to test against. If not provided, defaults
+    // to the current version of Python used to run `asv`.
+    // "pythons": ["3.6", "3.9"],
+
+    // The list of conda channel names to be searched for benchmark
+    // dependency packages in the specified order
+    // "conda_channels": ["conda-forge", "defaults"],
+
+    // The matrix of dependencies to test. Each key is the name of a
+    // package (in PyPI) and the values are version numbers. An empty
+    // list or empty string indicates to just test against the default
+    // (latest) version. null indicates that the package is to not be
+    // installed. If the package to be tested is only available from
+    // PyPi, and the 'environment_type' is conda, then you can preface
+    // the package name by 'pip+', and the package will be installed via
+    // pip (with all the conda available packages installed first,
+    // followed by the pip installed packages).
+    //
+    // "matrix": {
+    //     "numpy": ["1.6", "1.7"],
+    //     "six": ["", null], // test with and without six installed
+    //     "pip+emcee": [""], // emcee is only available for install with pip.
+    // },
+
+    // Combinations of libraries/python versions can be excluded/included
+    // from the set to test. Each entry is a dictionary containing additional
+    // key-value pairs to include/exclude.
+    //
+    // An exclude entry excludes entries where all values match. The
+    // values are regexps that should match the whole string.
+    //
+    // An include entry adds an environment. Only the packages listed
+    // are installed. The 'python' key is required. The exclude rules
+    // do not apply to includes.
+    //
+    // In addition to package names, the following keys are available:
+    //
+    // - python
+    //     Python version, as in the *pythons* variable above.
+    // - environment_type
+    //     Environment type, as above.
+    // - sys_platform
+    //     Platform, as in sys.platform. Possible values for the common
+    //     cases: 'linux2', 'win32', 'cygwin', 'darwin'.
+    //
+    // "exclude": [
+    //     {"python": "3.2", "sys_platform": "win32"}, // skip py3.2 on windows
+    //     {"environment_type": "conda", "six": null}, // don't run without six on conda
+    // ],
+    //
+    // "include": [
+    //     // additional env for python2.7
+    //     {"python": "2.7", "numpy": "1.8"},
+    //     // additional env if run on windows+conda
+    //     {"platform": "win32", "environment_type": "conda", "python": "2.7", "libpython": ""},
+    // ],
+
+    // The directory (relative to the current directory) that benchmarks are
+    // stored in. If not provided, defaults to "benchmarks"
+    "benchmark_dir": ".",
+
+    // The directory (relative to the current directory) to cache the Python
+    // environments in. If not provided, defaults to "env"
+    "env_dir": "../.asv/env",
+
+    // The directory (relative to the current directory) that raw benchmark
+    // results are stored in. If not provided, defaults to "results".
+    "results_dir": "../.asv/results",
+
+    // The directory (relative to the current directory) that the html tree
+    // should be written to. If not provided, defaults to "html".
+    "html_dir": "../.asv/html",
+
+    // The number of characters to retain in the commit hashes.
+    // "hash_length": 8,
+
+    // `asv` will cache results of the recent builds in each
+    // environment, making them faster to install next time. This is
+    // the number of builds to keep, per environment.
+    // "build_cache_size": 2,
+
+    // The commits after which the regression search in `asv publish`
+    // should start looking for regressions. Dictionary whose keys are
+    // regexps matching to benchmark names, and values corresponding to
+    // the commit (exclusive) after which to start looking for
+    // regressions. The default is to start from the first commit
+    // with results. If the commit is `null`, regression detection is
+    // skipped for the matching benchmark.
+    //
+    // "regressions_first_commits": {
+    //     "some_benchmark": "352cdf", // Consider regressions only after this commit
+    //     "another_benchmark": null, // Skip regression detection altogether
+    // },
+
+    // The thresholds for relative change in results, after which `asv
+    // publish` starts reporting regressions. Dictionary of the same
+    // form as in ``regressions_first_commits``, with values
+    // indicating the thresholds. If multiple entries match, the
+    // maximum is taken. If no entry matches, the default is 5%.
+    //
+    // "regressions_thresholds": {
+    //     "some_benchmark": 0.01, // Threshold of 1%
+    //     "another_benchmark": 0.5, // Threshold of 50%
+    // },
+}
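[Note: `env_dir`, `results_dir`, and `html_dir` all resolve into `../.asv/`, which is exactly what the new `.asv` entry in `.gitignore` above covers; with `repo` set to `..`, asv commands such as `asv run` and `asv preview` are meant to be invoked from inside `benchmarks/`.]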
diff --git a/benchmarks/benchmarks.py b/benchmarks/benchmarks.py
new file mode 100644
index 00000000..cec69f54
--- /dev/null
+++ b/benchmarks/benchmarks.py
@@ -0,0 +1,57 @@
+# Write the benchmarking functions here.
+# See "Writing benchmarks" in the asv docs for more information.
+import chainladder as cl
+
+class TimeSuite:
+    def setup(self):
+        self.prism = cl.load_sample('prism')
+
+    def time_incr_to_cum(self):
+        self.prism.incr_to_cum()
+
+    def time_groupby(self):
+        self.prism.groupby(['Line']).sum()
+
+    def time_index_broadcasting(self):
+        self.prism / self.prism.groupby(['Line']).sum()
+
+    def time_grain(self):
+        self.prism.grain('OYDY')
+
+    def time_dev_to_val(self):
+        self.prism.dev_to_val()
+
+    def time_val_to_dev(self):
+        self.prism.dev_to_val().val_to_dev()
+
+    def time_fit_chainladder(self):
+        cl.Chainladder().fit(
+            cl.Development(groupby=lambda x: 1).fit_transform(self.prism['Paid'])
+        ).ibnr_
+
+class MemSuite:
+    def setup(self):
+        self.prism = cl.load_sample('prism')
+
+    def peakmem_incr_to_cum(self):
+        self.prism.incr_to_cum()
+
+    def peakmem_groupby(self):
+        self.prism.groupby(['Line']).sum()
+
+    def peakmem_index_broadcasting(self):
+        self.prism / self.prism.groupby(['Line']).sum()
+
+    def peakmem_grain(self):
+        self.prism.grain('OYDY')
+
+    def peakmem_dev_to_val(self):
+        self.prism.dev_to_val()
+
+    def peakmem_val_to_dev(self):
+        self.prism.dev_to_val().val_to_dev()
+
+    def peakmem_fit_chainladder(self):
+        cl.Chainladder().fit(
+            cl.Development(groupby=lambda x: 1).fit_transform(self.prism['Paid'])
+        ).ibnr_
diff --git a/chainladder/core/base.py b/chainladder/core/base.py
index ac27e771..ff20fde4 100644
--- a/chainladder/core/base.py
+++ b/chainladder/core/base.py
@@ -115,13 +115,25 @@ def __init__(
             ' is expressed as an age where a date-like vector is required')
 
         # Summarize dataframe to the level specified in axes
-        data["__origin__"] = origin_date
-        data["__development__"] = development_date
-        key_gr = ["__origin__", "__development__"] + [
-            data[item] for item in ([] if not index else index)
-        ]
-        data_agg = data.groupby(key_gr)[columns].sum().reset_index().fillna(0)
-        data = data.drop(['__origin__', '__development__'], axis=1)
+        if not isinstance(data, pd.DataFrame):
+            # Dask (non-pandas) dataframes take the original mutate-then-drop path
+            data["__origin__"] = origin_date
+            data["__development__"] = development_date
+            key_gr = ["__origin__", "__development__"] + [
+                data[item] for item in ([] if not index else index)
+            ]
+            data_agg = data.groupby(key_gr)[columns].sum().reset_index().fillna(0)
+            data = data.drop(['__origin__', '__development__'], axis=1)
+        else:
+            # Pandas dataframes are aggregated without mutating the caller's data
+            key_gr = [origin_date, development_date] + [
+                data[item] for item in ([] if not index else index)
+            ]
+            data_agg = data[columns].groupby(key_gr).sum().reset_index().fillna(0)
+            data_agg["__origin__"] = data_agg[origin_date.name]
+            data_agg["__development__"] = data_agg[development_date.name]
+
+
         if not index:
             index = ["Total"]
             data_agg[index[0]] = "Total"
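[Note: the new `else` branch above groups by a list of Series, so the caller's dataframe is never mutated, whereas the dask path still has to write and then drop temporary `__origin__`/`__development__` columns. A minimal pandas-only sketch of that pattern, with hypothetical toy column names:]

    import pandas as pd

    # Toy stand-in for the user's input data; column names are hypothetical
    df = pd.DataFrame({
        "origin": ["2000", "2000", "2001"],
        "valuation": ["2000", "2000", "2001"],
        "Line": ["A", "A", "B"],
        "Paid": [100.0, 50.0, 75.0],
    })
    origin_date = pd.to_datetime(df["origin"])
    development_date = pd.to_datetime(df["valuation"])

    # Group by the Series directly -- df itself is left untouched
    key_gr = [origin_date, development_date, df["Line"]]
    data_agg = df[["Paid"]].groupby(key_gr).sum().reset_index().fillna(0)
    print(data_agg)  # the two "A" rows collapse into one (origin, valuation, Line) cell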
diff --git a/chainladder/core/dunders.py b/chainladder/core/dunders.py
index e6c9e335..11f8601e 100644
--- a/chainladder/core/dunders.py
+++ b/chainladder/core/dunders.py
@@ -8,6 +8,11 @@
 from chainladder.core.pandas import TriangleGroupBy
 from chainladder.utils.sparse import sp
 
+try:
+    import dask.bag as db
+except ImportError:
+    db = None
+
 
 class TriangleDunders:
     """ Class that implements the dunder (double underscore) methods for the Triangle class
@@ -200,13 +205,23 @@ def _get_key_union(obj, other):
         return set(list(obj.groups.indices.keys()) +
                    list(other.groups.indices.keys()))
 
+    def _arithmetic_mapper(self, obj, other, f):
+        """ Use dask.bag if available, otherwise a plain list comprehension """
+        if db and obj.obj.array_backend == 'sparse':
+            bag = db.from_sequence(self._get_key_union(obj, other))
+            bag = bag.map(f, self, obj, other)
+            c = bag.compute(scheduler='threads')
+        else:
+            c = [f(k, self, obj, other) for k in self._get_key_union(obj, other)]
+        return concat(c, 0).sort_index()
+
     def __add__(self, other):
         obj, other = self._validate_arithmetic(other)
         if isinstance(obj, TriangleGroupBy):
-            c = [self._slice_or_nan(obj, other, k) +
-                 self._slice_or_nan(other, obj, k)
-                 for k in self._get_key_union(obj, other)]
-            obj = concat(c, 0).sort_index()
+            def f(k, self, obj, other):
+                return (self._slice_or_nan(obj, other, k) +
+                        self._slice_or_nan(other, obj, k))
+            obj = self._arithmetic_mapper(obj, other, f)
         else:
             xp = obj.get_array_module()
             obj.values = xp.nan_to_num(obj.values) + xp.nan_to_num(other)
@@ -218,10 +233,10 @@ def __radd__(self, other):
     def __sub__(self, other):
         obj, other = self._validate_arithmetic(other)
         if isinstance(obj, TriangleGroupBy):
-            c = [self._slice_or_nan(obj, other, k) -
-                 self._slice_or_nan(other, obj, k)
-                 for k in self._get_key_union(obj, other)]
-            obj = concat(c, 0).sort_index()
+            def f(k, self, obj, other):
+                return (self._slice_or_nan(obj, other, k) -
+                        self._slice_or_nan(other, obj, k))
+            obj = self._arithmetic_mapper(obj, other, f)
         else:
             xp = obj.get_array_module()
             obj.values = xp.nan_to_num(obj.values) - xp.nan_to_num(other)
@@ -252,10 +267,10 @@ def __abs__(self):
     def __mul__(self, other):
         obj, other = self._validate_arithmetic(other)
         if isinstance(obj, TriangleGroupBy):
-            c = [self._slice_or_nan(obj, other, k) *
-                 self._slice_or_nan(other, obj, k)
-                 for k in self._get_key_union(obj, other)]
-            obj = concat(c, 0).sort_index()
+            def f(k, self, obj, other):
+                return (self._slice_or_nan(obj, other, k) *
+                        self._slice_or_nan(other, obj, k))
+            obj = self._arithmetic_mapper(obj, other, f)
         else:
             xp = obj.get_array_module()
             obj.values = obj.values * other
@@ -267,10 +282,10 @@ def __rmul__(self, other):
     def __pow__(self, other):
         obj, other = self._validate_arithmetic(other)
         if isinstance(obj, TriangleGroupBy):
-            c = [self._slice_or_nan(obj, other, k) **
-                 self._slice_or_nan(other, obj, k)
-                 for k in self._get_key_union(obj, other)]
-            obj = concat(c, 0).sort_index()
+            def f(k, self, obj, other):
+                return (self._slice_or_nan(obj, other, k) **
+                        self._slice_or_nan(other, obj, k))
+            obj = self._arithmetic_mapper(obj, other, f)
         else:
             xp = obj.get_array_module()
             obj.values = xp.nan_to_num(obj.values) ** other
@@ -285,10 +300,10 @@ def __round__(self, other):
     def __truediv__(self, other):
         obj, other = self._validate_arithmetic(other)
         if isinstance(obj, TriangleGroupBy):
-            c = [self._slice_or_nan(obj, other, k) /
-                 self._slice_or_nan(other, obj, k)
-                 for k in self._get_key_union(obj, other)]
-            obj = concat(c, 0).sort_index()
+            def f(k, self, obj, other):
+                return (self._slice_or_nan(obj, other, k) /
+                        self._slice_or_nan(other, obj, k))
+            obj = self._arithmetic_mapper(obj, other, f)
         else:
             xp = obj.get_array_module()
             obj.values = obj.values / other
diff --git a/environment-dev.yaml b/environment-dev.yaml
index ad6a3c01..19e1663b 100644
--- a/environment-dev.yaml
+++ b/environment-dev.yaml
@@ -39,3 +39,4 @@ dependencies:
   - pip:
     - sphinx_gallery
+    - asv
 
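[Note: `_arithmetic_mapper` is the standard dask.bag fan-out: extra positional arguments to `Bag.map` are forwarded to the mapped function after each element, and the threaded scheduler keeps the work in-process, presumably so the Triangle shards never need to be pickled. A self-contained sketch of the same pattern, with a hypothetical `per_key` function standing in for `f`:]

    import dask.bag as db

    def per_key(k, offset):
        # stands in for f(k, self, obj, other) in the patch above
        return k * k + offset

    bag = db.from_sequence(range(8))         # one item per group key
    bag = bag.map(per_key, 10)               # extra args forwarded, as in bag.map(f, self, obj, other)
    print(bag.compute(scheduler='threads'))  # [10, 11, 14, 19, 26, 35, 46, 59]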