
Commit

resolves #180
jbogaardt committed Jul 25, 2021
1 parent d06f0ce commit 468d733
Showing 7 changed files with 278 additions and 27 deletions.
5 changes: 5 additions & 0 deletions .gitignore
@@ -117,3 +117,8 @@ settings.json
 *.DS_Store
 *Icon
 *r
+
+# asv environments
+.asv
+
+coverage_html_report
1 change: 1 addition & 0 deletions benchmarks/__init__.py
@@ -0,0 +1 @@

160 changes: 160 additions & 0 deletions benchmarks/asv.conf.json
@@ -0,0 +1,160 @@
{
// The version of the config file format. Do not change, unless
// you know what you are doing.
"version": 1,

// The name of the project being benchmarked
"project": "chainladder-python",

// The project's homepage
"project_url": "https://chainladder-python.readthedocs.io/en/latest/",

// The URL or local path of the source code repository for the
// project being benchmarked
"repo": "..",

// The Python project's subdirectory in your repo. If missing or
// the empty string, the project is assumed to be located at the root
// of the repository.
// "repo_subdir": "",

// Customizable commands for building, installing, and
// uninstalling the project. See asv.conf.json documentation.
//
// "install_command": ["in-dir={env_dir} python -mpip install {wheel_file}"],
// "uninstall_command": ["return-code=any python -mpip uninstall -y {project}"],
// "build_command": [
// "python setup.py build",
// "PIP_NO_BUILD_ISOLATION=false python -mpip wheel --no-deps --no-index -w {build_cache_dir} {build_dir}"
// ],

// List of branches to benchmark. If not provided, defaults to "master"
// (for git) or "default" (for mercurial).
// "branches": ["master"], // for git
// "branches": ["default"], // for mercurial

// The DVCS being used. If not set, it will be automatically
// determined from "repo" by looking at the protocol in the URL
// (if remote), or by looking for special directories, such as
// ".git" (if local).
// "dvcs": "git",

// The tool to use to create environments. May be "conda",
// "virtualenv" or other value depending on the plugins in use.
// If missing or the empty string, the tool will be automatically
// determined by looking for tools on the PATH environment
// variable.
"environment_type": "conda",

// timeout in seconds for installing any dependencies in environment
// defaults to 10 min
//"install_timeout": 600,

// the base URL to show a commit for the project.
// "show_commit_url": "http://github.com/owner/project/commit/",

// The Pythons you'd like to test against. If not provided, defaults
// to the current version of Python used to run `asv`.
// "pythons": ["3.6", "3.9"],

// The list of conda channel names to be searched for benchmark
// dependency packages in the specified order
// "conda_channels": ["conda-forge", "defaults"],

// The matrix of dependencies to test. Each key is the name of a
// package (in PyPI) and the values are version numbers. An empty
// list or empty string indicates to just test against the default
// (latest) version. null indicates that the package is to not be
// installed. If the package to be tested is only available from
// PyPi, and the 'environment_type' is conda, then you can preface
// the package name by 'pip+', and the package will be installed via
// pip (with all the conda available packages installed first,
// followed by the pip installed packages).
//
// "matrix": {
// "numpy": ["1.6", "1.7"],
// "six": ["", null], // test with and without six installed
// "pip+emcee": [""], // emcee is only available for install with pip.
// },

// Combinations of libraries/python versions can be excluded/included
// from the set to test. Each entry is a dictionary containing additional
// key-value pairs to include/exclude.
//
// An exclude entry excludes entries where all values match. The
// values are regexps that should match the whole string.
//
// An include entry adds an environment. Only the packages listed
// are installed. The 'python' key is required. The exclude rules
// do not apply to includes.
//
// In addition to package names, the following keys are available:
//
// - python
// Python version, as in the *pythons* variable above.
// - environment_type
// Environment type, as above.
// - sys_platform
// Platform, as in sys.platform. Possible values for the common
// cases: 'linux2', 'win32', 'cygwin', 'darwin'.
//
// "exclude": [
// {"python": "3.2", "sys_platform": "win32"}, // skip py3.2 on windows
// {"environment_type": "conda", "six": null}, // don't run without six on conda
// ],
//
// "include": [
// // additional env for python2.7
// {"python": "2.7", "numpy": "1.8"},
// // additional env if run on windows+conda
// {"platform": "win32", "environment_type": "conda", "python": "2.7", "libpython": ""},
// ],

// The directory (relative to the current directory) that benchmarks are
// stored in. If not provided, defaults to "benchmarks"
"benchmark_dir": ".",

// The directory (relative to the current directory) to cache the Python
// environments in. If not provided, defaults to "env"
"env_dir": "../.asv/env",

// The directory (relative to the current directory) that raw benchmark
// results are stored in. If not provided, defaults to "results".
"results_dir": "../.asv/results",

// The directory (relative to the current directory) that the html tree
// should be written to. If not provided, defaults to "html".
"html_dir": "../.asv/html",

// The number of characters to retain in the commit hashes.
// "hash_length": 8,

// `asv` will cache results of the recent builds in each
// environment, making them faster to install next time. This is
// the number of builds to keep, per environment.
// "build_cache_size": 2,

// The commits after which the regression search in `asv publish`
// should start looking for regressions. Dictionary whose keys are
// regexps matching to benchmark names, and values corresponding to
// the commit (exclusive) after which to start looking for
// regressions. The default is to start from the first commit
// with results. If the commit is `null`, regression detection is
// skipped for the matching benchmark.
//
// "regressions_first_commits": {
// "some_benchmark": "352cdf", // Consider regressions only after this commit
// "another_benchmark": null, // Skip regression detection altogether
// },

// The thresholds for relative change in results, after which `asv
// publish` starts reporting regressions. Dictionary of the same
// form as in ``regressions_first_commits``, with values
// indicating the thresholds. If multiple entries match, the
// maximum is taken. If no entry matches, the default is 5%.
//
// "regressions_thresholds": {
// "some_benchmark": 0.01, // Threshold of 1%
// "another_benchmark": 0.5, // Threshold of 50%
// },
}
57 changes: 57 additions & 0 deletions benchmarks/benchmarks.py
@@ -0,0 +1,57 @@
# Write the benchmarking functions here.
# See "Writing benchmarks" in the asv docs for more information.
import chainladder as cl

class TimeSuite:
    def setup(self):
        self.prism = cl.load_sample('prism')

    def time_incr_to_cum(self):
        self.prism.incr_to_cum()

    def time_groupby(self):
        self.prism.groupby(['Line']).sum()

    def time_index_broadcasting(self):
        self.prism / self.prism.groupby(['Line']).sum()

    def time_grain(self):
        self.prism.grain('OYDY')

    def time_dev_to_val(self):
        self.prism.dev_to_val()

    def time_val_to_dev(self):
        self.prism.dev_to_val().val_to_dev()

    def time_fit_chainladder(self):
        cl.Chainladder().fit(
            cl.Development(groupby=lambda x : 1).fit_transform(self.prism['Paid'])
        ).ibnr_

class MemSuite:
    def setup(self):
        self.prism = cl.load_sample('prism')

    def peakmem_incr_to_cum(self):
        self.prism.incr_to_cum()

    def peakmem_groupby(self):
        self.prism.groupby(['Line']).sum()

    def peakmem_index_broadcasting(self):
        self.prism / self.prism.groupby(['Line']).sum()

    def peakmem_grain(self):
        self.prism.grain('OYDY')

    def peakmem_dev_to_val(self):
        self.prism.dev_to_val()

    def peakmem_val_to_dev(self):
        self.prism.dev_to_val().val_to_dev()

    def peakmem_fit_chainladder(self):
        cl.Chainladder().fit(
            cl.Development(groupby=lambda x : 1).fit_transform(self.prism['Paid'])
        ).ibnr_
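
For context on how asv consumes these classes: setup() runs before each benchmark, time_* methods are timed over their body only, and peakmem_* methods are measured for peak memory use. A rough stand-alone sanity check of one of the timed operations, outside of asv, could look like the sketch below; the repetition count of 5 is an arbitrary illustration and not part of the committed suite.

# Hypothetical local check of the time_incr_to_cum benchmark; not part of the asv suite
import timeit

import chainladder as cl

prism = cl.load_sample('prism')  # mirrors TimeSuite.setup
elapsed = timeit.timeit(lambda: prism.incr_to_cum(), number=5)
print(f"incr_to_cum: {elapsed / 5:.3f}s per call")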
26 changes: 19 additions & 7 deletions chainladder/core/base.py
@@ -115,13 +115,25 @@ def __init__(
                 ' is expressed as an age where a date-like vector is required')
 
         # Summarize dataframe to the level specified in axes
-        data["__origin__"] = origin_date
-        data["__development__"] = development_date
-        key_gr = ["__origin__", "__development__"] + [
-            data[item] for item in ([] if not index else index)
-        ]
-        data_agg = data.groupby(key_gr)[columns].sum().reset_index().fillna(0)
-        data = data.drop(['__origin__', '__development__'], axis=1)
+        if type(data) != pd.DataFrame:
+            # Dask dataframes are mutated
+            data["__origin__"] = origin_date
+            data["__development__"] = development_date
+            key_gr = ["__origin__", "__development__"] + [
+                data[item] for item in ([] if not index else index)
+            ]
+            data_agg = data.groupby(key_gr)[columns].sum().reset_index().fillna(0)
+            data = data.drop(['__origin__', '__development__'], axis=1)
+        else:
+            # Summarize dataframe to the level specified in axes
+            key_gr = [origin_date, development_date] + [
+                data[item] for item in ([] if not index else index)
+            ]
+            data_agg = data[columns].groupby(key_gr).sum().reset_index().fillna(0)
+            data_agg["__origin__"] = data_agg[origin_date.name]
+            data_agg["__development__"] = data_agg[development_date.name]
+
+
         if not index:
             index = ["Total"]
             data_agg[index[0]] = "Total"
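
The new branch keys off type(data) != pd.DataFrame: dataframe-like objects that are not plain pandas frames (the inline comment calls out Dask dataframes) keep the original mutate-then-groupby path, while plain pandas input is now aggregated without writing temporary __origin__/__development__ columns into the caller's data. As a rough sketch of the kind of input this path is meant to accept (assuming a dask DataFrame is passed straight to the Triangle constructor; the file name and column names below are made up):

# Hypothetical usage sketch; 'claims.csv' and the column names are illustrative only
import dask.dataframe as dd
import chainladder as cl

ddf = dd.read_csv('claims.csv', parse_dates=['accident_date', 'valuation_date'])
triangle = cl.Triangle(
    ddf,
    origin='accident_date',
    development='valuation_date',
    columns=['paid'],
    index=['line'],
    cumulative=False,
)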
55 changes: 35 additions & 20 deletions chainladder/core/dunders.py
@@ -8,6 +8,11 @@
 from chainladder.core.pandas import TriangleGroupBy
 from chainladder.utils.sparse import sp
 
+try:
+    import dask.bag as db
+except:
+    db = None
+
 class TriangleDunders:
     """ Class that implements the dunder (double underscore) methods for the
     Triangle class
Expand Down Expand Up @@ -200,13 +205,23 @@ def _get_key_union(obj, other):
return set(list(obj.groups.indices.keys()) +
list(other.groups.indices.keys()))

def _arithmetic_mapper(self, obj, other, f):
""" Use Dask if available, otherwise basic list comprehension """
if db and obj.obj.array_backend == 'sparse':
bag = db.from_sequence(self._get_key_union(obj, other))
bag = bag.map(f, self, obj, other)
c = bag.compute(scheduler='threads')
else:
c = [f(k, self, obj, other) for k in self._get_key_union(obj, other)]
return concat(c, 0).sort_index()

def __add__(self, other):
obj, other = self._validate_arithmetic(other)
if isinstance(obj, TriangleGroupBy):
c = [self._slice_or_nan(obj, other, k) +
self._slice_or_nan(other, obj, k)
for k in self._get_key_union(obj, other)]
obj = concat(c, 0).sort_index()
def f(k, self, obj, other):
return (self._slice_or_nan(obj, other, k) +
self._slice_or_nan(other, obj, k))
obj = self._arithmetic_mapper(obj, other, f)
else:
xp = obj.get_array_module()
obj.values = xp.nan_to_num(obj.values) + xp.nan_to_num(other)
@@ -218,10 +233,10 @@ def __radd__(self, other):
     def __sub__(self, other):
         obj, other = self._validate_arithmetic(other)
         if isinstance(obj, TriangleGroupBy):
-            c = [self._slice_or_nan(obj, other, k) -
-                 self._slice_or_nan(other, obj, k)
-                 for k in self._get_key_union(obj, other)]
-            obj = concat(c, 0).sort_index()
+            def f(k, self, obj, other):
+                return (self._slice_or_nan(obj, other, k) -
+                        self._slice_or_nan(other, obj, k))
+            obj = self._arithmetic_mapper(obj, other, f)
         else:
             xp = obj.get_array_module()
             obj.values = xp.nan_to_num(obj.values) - xp.nan_to_num(other)
@@ -252,10 +267,10 @@ def __abs__(self):
     def __mul__(self, other):
         obj, other = self._validate_arithmetic(other)
         if isinstance(obj, TriangleGroupBy):
-            c = [self._slice_or_nan(obj, other, k) *
-                 self._slice_or_nan(other, obj, k)
-                 for k in self._get_key_union(obj, other)]
-            obj = concat(c, 0).sort_index()
+            def f(k, self, obj, other):
+                return (self._slice_or_nan(obj, other, k) *
+                        self._slice_or_nan(other, obj, k))
+            obj = self._arithmetic_mapper(obj, other, f)
         else:
             xp = obj.get_array_module()
             obj.values = obj.values * other
@@ -267,10 +282,10 @@ def __rmul__(self, other):
     def __pow__(self, other):
         obj, other = self._validate_arithmetic(other)
         if isinstance(obj, TriangleGroupBy):
-            c = [self._slice_or_nan(obj, other, k) **
-                 self._slice_or_nan(other, obj, k)
-                 for k in self._get_key_union(obj, other)]
-            obj = concat(c, 0).sort_index()
+            def f(k, self, obj, other):
+                return (self._slice_or_nan(obj, other, k) **
+                        self._slice_or_nan(other, obj, k))
+            obj = self._arithmetic_mapper(obj, other, f)
         else:
             xp = obj.get_array_module()
             obj.values = xp.nan_to_num(obj.values) ** other
@@ -285,10 +300,10 @@ def __round__(self, other):
     def __truediv__(self, other):
         obj, other = self._validate_arithmetic(other)
         if isinstance(obj, TriangleGroupBy):
-            c = [self._slice_or_nan(obj, other, k) /
-                 self._slice_or_nan(other, obj, k)
-                 for k in self._get_key_union(obj, other)]
-            obj = concat(c, 0).sort_index()
+            def f(k, self, obj, other):
+                return (self._slice_or_nan(obj, other, k) /
+                        self._slice_or_nan(other, obj, k))
+            obj = self._arithmetic_mapper(obj, other, f)
         else:
             xp = obj.get_array_module()
             obj.values = obj.values / other
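
The new _arithmetic_mapper helper consolidates the per-group arithmetic that was previously written out as a list comprehension in each of __add__, __sub__, __mul__, __pow__, and __truediv__. When dask.bag imports successfully and the backend is sparse, the per-key function is mapped over a bag and evaluated on the threaded scheduler; otherwise it falls back to the plain list comprehension. The underlying dask pattern, shown here on a toy function rather than triangle slices, is roughly:

# Toy illustration of the dask.bag map/compute pattern used by _arithmetic_mapper;
# square() and the input list are made up for demonstration
import dask.bag as db

def square(x):
    return x * x

bag = db.from_sequence([1, 2, 3, 4])
results = bag.map(square).compute(scheduler='threads')  # [1, 4, 9, 16]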
1 change: 1 addition & 0 deletions environment-dev.yaml
@@ -39,3 +39,4 @@ dependencies:
 
   - pip:
     - sphinx_gallery
+    - asv