Skip to content

Commit

Permalink
Merge pull request #39 from jbogaardt/development-options
Browse files Browse the repository at this point in the history
Development options
  • Loading branch information
jbogaardt authored Mar 12, 2019
2 parents 5049d9c + 8da0f33 commit 7df5952
Show file tree
Hide file tree
Showing 18 changed files with 167 additions and 27 deletions.
37 changes: 28 additions & 9 deletions chainladder/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,12 +103,15 @@ def columns(self, value):

@property
def origin(self):
return pd.DatetimeIndex(self.odims, name='origin')
return pd.DatetimeIndex(self.odims, name='origin') \
.to_period(self.origin_grain)

@origin.setter
def origin(self, value):
self._len_check(self.origin, value)
self.odims = pd.Series([value] if type(value) is str else value).values
value = pd.PeriodIndex([item for item in list(value)],
freq=self.origin_grain).to_timestamp()
self.odims = value.values

@property
def development(self):
Expand All @@ -128,7 +131,7 @@ def link_ratio(self):
obj = copy.deepcopy(self)
temp = obj.values.copy()
temp[temp == 0] = np.nan
val_array = obj.valuation.values.reshape(obj.shape[-2:],
val_array = obj.valuation.to_timestamp().values.reshape(obj.shape[-2:],
order='f')[:, 1:]
obj.values = temp[..., 1:]/temp[..., :-1]
obj.ddims = np.array(['{}-{}'.format(obj.ddims[i], obj.ddims[i+1])
Expand All @@ -139,7 +142,7 @@ def link_ratio(self):
obj.odims = obj.odims[:-1]
val_array = val_array[:-1, :]
obj.valuation = pd.DatetimeIndex(
pd.DataFrame(val_array).unstack().values)
pd.DataFrame(val_array).unstack().values).to_period(self._lowest_grain())
return obj

@property
Expand All @@ -159,7 +162,7 @@ def get_latest_diagonal(self, compress=True):
diagonal = np.expand_dims(np.nansum(diagonal, 3), 3)
obj.ddims = ['Latest']
obj.valuation = pd.DatetimeIndex(
[pd.to_datetime(obj.valuation_date)]*len(obj.odims))
[pd.to_datetime(obj.valuation_date)]*len(obj.odims)).to_period(self._lowest_grain())
obj.values = diagonal
return obj

Expand Down Expand Up @@ -252,7 +255,9 @@ def grain(self, grain='', incremental=False, inplace=False):
self.values = self._slide(new_tri, direction='l')
self.values[self.values == 0] = np.nan
self.valuation = self._valuation_triangle()
del self._nan_triangle
if hasattr(self, '_nan_triangle'):
# Force update on _nan_triangle at next access.
del self._nan_triangle
if incremental:
self.cum_to_incr(inplace=True)
return self
Expand Down Expand Up @@ -283,7 +288,7 @@ def trend(self, trend=0.0):
.value/365.25)
else:
trend = (1 + trend)**-(
pd.Series(self.valuation.values -
pd.Series(self.valuation.to_timestamp().values -
np.datetime64(self.valuation_date)).dt.days
.values.reshape(self.shape[-2:], order='f')/365.25)
obj = copy.deepcopy(self)
Expand Down Expand Up @@ -433,6 +438,8 @@ def _validate_arithmetic(self, other):
ddims = set(self.ddims).intersection(set(other.ddims))
odims = set(self.odims).intersection(set(other.odims))
# Need to set string vs int type-casting
odims = pd.PeriodIndex(np.array(list(odims)),
freq=self.origin_grain)
obj = obj[obj.origin.isin(odims)][obj.development.isin(ddims)]
other = other[other.origin.isin(odims)][other.development.isin(ddims)]
obj.odims = np.sort(np.array(list(odims)))
Expand Down Expand Up @@ -712,7 +719,7 @@ def nan_triangle(self):
hasattr(self, '_nan_triangle'):
self.valuation = self._valuation_triangle()
val_array = self.valuation
val_array = val_array.values.reshape(self.shape[-2:], order='f')
val_array = val_array.to_timestamp().values.reshape(self.shape[-2:], order='f')
nan_triangle = np.array(
pd.DataFrame(val_array) > self.valuation_date)
nan_triangle = np.where(nan_triangle, np.nan, 1)
Expand All @@ -736,6 +743,8 @@ def _valuation_triangle(self, ddims=None):
origin = pd.PeriodIndex(self.odims, freq=self.origin_grain) \
.to_timestamp(how='s')
origin = pd.Series(origin)
if type(self.valuation_date) is not pd.Timestamp:
self.valuation_date = self.valuation_date.to_timestamp()
# Limit origin to valuation date
origin[origin > self.valuation_date] = self.valuation_date
next_development = origin+pd.DateOffset(days=-1, months=ddims[0])
Expand All @@ -750,7 +759,16 @@ def _valuation_triangle(self, ddims=None):
next_development = np.expand_dims(
np.array(origin+pd.DateOffset(days=-1, months=item)), -1)
val_array = np.concatenate((val_array, next_development), -1)
return pd.DatetimeIndex(pd.DataFrame(val_array).unstack().values)
val_array = pd.DatetimeIndex(pd.DataFrame(val_array).unstack().values)
return val_array.to_period(self._lowest_grain())

def _lowest_grain(self):
my_list = ['M', 'Q', 'Y']
my_dict = {item: num for num, item in enumerate(my_list)}
lowest_grain = my_list[min(my_dict[self.origin_grain],
my_dict[self.development_grain])]
return lowest_grain


def _slide(self, triangle, direction='r'):
''' Facilitates swapping alignment of triangle between development
Expand Down Expand Up @@ -803,6 +821,7 @@ def to_datetime(data, fields, period_end=False):
target = target_field.map(arr)
if period_end:
target = TriangleBase._period_end(target)
target.name = 'valuation'
return target

@staticmethod
Expand Down
98 changes: 88 additions & 10 deletions chainladder/development/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
================
"""
import numpy as np
import pandas as pd
import copy
import warnings
from sklearn.base import BaseEstimator
Expand Down Expand Up @@ -52,6 +53,15 @@ class Development(DevelopmentBase):
'volume', 'simple', and 'regression'
sigma_interpolation : string optional (default='log-linear')
Options include 'log-linear' and 'mack'
drop : tuple or list of tuples
Drops specific origin/development combination(s)
drop_high : bool or list of bool (default=None)
Drops highest link ratio(s) from LDF calculation
drop_low : bool or list of bool (default=None)
Drops lowest link ratio(s) from LDF calculation
drop_valuation : str or list of str (default=None)
Drops specific valuation periods. str must be date convertible.
Attributes
----------
Expand All @@ -68,10 +78,15 @@ class Development(DevelopmentBase):
"""
def __init__(self, n_periods=-1, average='volume',
sigma_interpolation='log-linear'):
sigma_interpolation='log-linear', drop=None,
drop_high=None, drop_low=None, drop_valuation=None):
self.n_periods = n_periods
self.average = average
self.sigma_interpolation = sigma_interpolation
self.drop_high = drop_high
self.drop_low = drop_low
self.drop_valuation = drop_valuation
self.drop = drop

def _assign_n_periods_weight(self, X):
if type(self.n_periods) is int:
Expand Down Expand Up @@ -105,6 +120,68 @@ def _assign_n_periods_weight_int(self, X, n_periods):
np.ones((k, v, n_periods+1, d))), 2)*flip_nan
return w*X.expand_dims(X.nan_triangle())

def _drop_adjustment(self, X, link_ratio):
weight = X.nan_triangle()[:, :-1]
if self.drop_high is not None:
weight = weight*self._drop_hilo('high', X, link_ratio)
if self.drop_low is not None:
weight = weight*self._drop_hilo('low', X, link_ratio)
if self.drop is not None:
weight = weight*self._drop(X)
if self.drop_valuation is not None:
weight = weight*self._drop_valuation(X)
return weight

def _drop_hilo(self, kind, X, link_ratio):
    """Build a 0/1 mask excluding the highest or lowest link ratio per column.

    Parameters
    ----------
    kind : str
        'high' drops the column maximum (uses ``self.drop_high``);
        anything else drops the column minimum (uses ``self.drop_low``).
    X : Triangle
        Triangle being fit; only ``X.development`` is read here.
    link_ratio : np.ndarray
        Array of age-to-age link ratios.

    Returns
    -------
    np.ndarray
        0/1 mask, 0 where the extreme link ratio should be excluded.
    """
    # NOTE(review): mutates the caller's link_ratio array in place —
    # zeros become NaN for the min/max computations below.
    link_ratio[link_ratio == 0] = np.nan
    # Count of non-NaN link ratios per development column (first index slice).
    lr_valid_count = np.sum(~np.isnan(link_ratio)[0, 0], axis=0)
    if kind == 'high':
        vals = np.nanmax(link_ratio, -2, keepdims=True)
        drop_hilo = self.drop_high
    else:
        vals = np.nanmin(link_ratio, -2, keepdims=True)
        drop_hilo = self.drop_low
    # 0 where a cell equals the column extreme (ties all drop), 1 elsewhere.
    hilo = 1*(vals != link_ratio)
    if type(drop_hilo) is bool:
        # A scalar bool applies to every development period.
        drop_hilo = [drop_hilo]*(len(X.development)-1)
    for num, item in enumerate(self.average_):
        if not drop_hilo[num]:
            # Exclusion not requested for this column: force mask to all 1s.
            hilo[..., num] = hilo[..., num]*0+1
        else:
            if lr_valid_count[num] < 3:
                # Too few link ratios to drop one safely; keep everything.
                hilo[..., num] = hilo[..., num]*0+1
                warnings.warn('drop_high and drop_low cannot be computed '
                              'when less than three LDFs are present. '
                              'Ignoring exclusions in some cases.')
    return hilo

def _drop_valuation(self, X):
    """Build a 0/1 mask excluding whole valuation period(s) from the fit.

    ``self.drop_valuation`` (str or list of str, date-convertible) names
    the valuation period(s) to drop; cells of ``X`` falling in those
    periods get weight 0, everything else weight 1.

    Returns
    -------
    np.ndarray
        Mask of shape ``X.shape[-2:]`` minus the last development column.
    """
    if type(self.drop_valuation) is not list:
        drop_valuation = [self.drop_valuation]
    else:
        drop_valuation = self.drop_valuation
    # 1 everywhere except cells whose valuation matches a dropped period.
    arr = 1-np.nan_to_num(X[X.valuation.isin(
        pd.PeriodIndex(drop_valuation,
                       freq=X.origin_grain))].values[0, 0]*0+1)
    # The valuation slice may come back smaller than X; pad the missing
    # origin rows / development columns back out with 1s (i.e. "keep").
    ofill = X.shape[-2]-arr.shape[-2]
    dfill = X.shape[-1]-arr.shape[-1]
    if ofill > 0:
        arr = np.concatenate((arr, np.repeat(
            np.expand_dims(np.ones(arr.shape[-1]), 0), ofill, 0)), 0)
    if dfill > 0:
        arr = np.concatenate((arr, np.repeat(
            np.expand_dims(np.ones(arr.shape[-2]), -1), dfill, -1)), -1)
    return arr[:, :-1]

def _drop(self, X):
drop = [self.drop] if type(self.drop) is not list else self.drop
arr = X.nan_triangle()
for item in drop:
arr[np.where(X.origin == item[0])[0][0],
np.where(X.development == item[1])[0][0]] = 0
return arr[:, :-1]

def fit(self, X, y=None, sample_weight=None):
"""Fit the model with X.
Expand All @@ -129,17 +206,18 @@ def fit(self, X, y=None, sample_weight=None):
average = self.average
average = np.array(average)
self.average_ = average
weight_dict = {'regression': 2, 'volume': 1, 'simple': 0}
_x = tri_array[..., :-1]
_y = tri_array[..., 1:]
val = np.array([weight_dict.get(item.lower(), 2)
weight_dict = {'regression': 0, 'volume': 1, 'simple': 2}
x, y = tri_array[..., :-1], tri_array[..., 1:]
val = np.array([weight_dict.get(item.lower(), 1)
for item in average])
for i in [2, 1, 0]:
val = np.repeat(np.expand_dims(val, 0), tri_array.shape[i], axis=0)
val = np.nan_to_num(val * (_y * 0 + 1))
_w = self._assign_n_periods_weight(X) / (_x**(val))
self.w_ = self._assign_n_periods_weight(X)
params = WeightedRegression(axis=2, thru_orig=True).fit(_x, _y, _w)
val = np.nan_to_num(val * (y * 0 + 1))
link_ratio = np.divide(y, x, where=np.nan_to_num(x) != 0)
self.w_ = self._assign_n_periods_weight(X) * \
self._drop_adjustment(X, link_ratio)
w = self.w_ / (x**(val))
params = WeightedRegression(axis=2, thru_orig=True).fit(x, y, w)
if self.n_periods != 1:
params = params.sigma_fill(self.sigma_interpolation)
else:
Expand All @@ -148,7 +226,7 @@ def fit(self, X, y=None, sample_weight=None):
' statistics. Only LDFs have been calculated.')
params.std_err_ = np.nan_to_num(params.std_err_) + \
np.nan_to_num((1-np.nan_to_num(params.std_err_*0+1)) *
params.sigma_/np.swapaxes(np.sqrt(_x**(2-val))[..., 0:1, :], -1, -2))
params.sigma_/np.swapaxes(np.sqrt(x**(2-val))[..., 0:1, :], -1, -2))
params = np.concatenate((params.slope_,
params.sigma_,
params.std_err_), 3)
Expand Down
14 changes: 12 additions & 2 deletions chainladder/development/tests/test_development.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ def mack_p(data, average, est_sigma):
return cl.Development(average=average, sigma_interpolation=est_sigma).fit_transform(cl.load_dataset(data))


data = ['RAA', 'ABC', 'GenIns', 'M3IR5', 'MW2008', 'MW2014']
averages = [('simple', 2), ('volume', 1), ('regression', 0)]
data = ['RAA', 'GenIns', 'MW2014']
averages = [('simple', 0), ('volume', 1), ('regression', 2)]
est_sigma = [('mack', 'Mack'), ('log-linear', 'log-linear')]


Expand All @@ -36,6 +36,16 @@ def test_full_slice2():
assert cl.Development().fit_transform(cl.load_dataset('GenIns')).ldf_ == \
cl.Development(n_periods=[1000]*(cl.load_dataset('GenIns').shape[3]-1)).fit_transform(cl.load_dataset('GenIns')).ldf_

def test_drop1():
    # Dropping the 1982@12 cell must equal dropping the highest link
    # ratio in the first development column on the RAA triangle.
    raa_triangle = cl.load_dataset('raa')
    by_cell = cl.Development(drop=('1982', 12)).fit(raa_triangle)
    by_high = cl.Development(drop_high=[True] + [False] * 8).fit(raa_triangle)
    assert by_cell.ldf_.values[0, 0, 0, 0] == by_high.ldf_.values[0, 0, 0, 0]

def test_drop2():
    # Dropping the 1981 valuation period must equal dropping the lowest
    # link ratio in the first development column on the RAA triangle.
    raa_triangle = cl.load_dataset('raa')
    by_valuation = cl.Development(drop_valuation='1981').fit(raa_triangle)
    by_low = cl.Development(drop_low=[True] + [False] * 8).fit(raa_triangle)
    assert by_valuation.ldf_.values[0, 0, 0, 0] == by_low.ldf_.values[0, 0, 0, 0]

def test_n_periods():
d = cl.load_dataset('usauto')['incurred']
return np.all(np.round(np.unique(
Expand Down
2 changes: 1 addition & 1 deletion chainladder/methods/mack.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def fit(self, X, y=None, sample_weight=None):
def full_std_err_(self):
obj = copy.deepcopy(self.X_)
tri_array = self.full_triangle_.values
weight_dict = {'regression': 2, 'volume': 1, 'simple': 0}
weight_dict = {'regression': 0, 'volume': 1, 'simple': 2}
val = np.array([weight_dict.get(item.lower(), 2)
for item in list(self.average_) + ['volume']])
for i in [2, 1, 0]:
Expand Down
4 changes: 2 additions & 2 deletions chainladder/methods/tests/test_mack.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,9 @@ def mack_p(data, average, est_sigma, tail):
return cl.MackChainladder().fit(cl.Development(average=average, sigma_interpolation=est_sigma).fit_transform(cl.load_dataset(data)))


data = ['RAA', 'ABC', 'GenIns', 'MW2008', 'MW2014']
data = ['ABC', 'MW2008']
tail = [True, False]
averages = [('simple', 2), ('volume', 1), ('regression', 0)]
averages = [('simple', 0), ('volume', 1), ('regression', 2)]
est_sigma = [('log-linear', 'log-linear'), ('mack', 'Mack')]


Expand Down
2 changes: 1 addition & 1 deletion chainladder/tails/tests/test_exponential.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def mack_p_no_tail(data, average, est_sigma):

data = ['RAA', 'ABC', 'GenIns', 'MW2008', 'MW2014']
# M3IR5 in R fails silently on exponential tail. Python actually computes it.
averages = [('simple', 2), ('volume', 1), ('regression', 0)]
averages = [('simple', 0), ('volume', 1), ('regression', 2)]
est_sigma = [('mack', 'Mack'), ('log-linear', 'log-linear')]


Expand Down
Binary file modified docs/auto_examples/auto_examples_python.zip
Binary file not shown.
Binary file modified docs/auto_examples/plot_benktander_codeobj.pickle
Binary file not shown.
Binary file modified docs/auto_examples/plot_bf_apriori_from_cl_codeobj.pickle
Binary file not shown.
Binary file modified docs/auto_examples/plot_bootstrap_codeobj.pickle
Binary file not shown.
Binary file modified docs/auto_examples/plot_capecod_codeobj.pickle
Binary file not shown.
Binary file modified docs/auto_examples/plot_development_periods_codeobj.pickle
Binary file not shown.
Binary file modified docs/auto_examples/plot_exhibits_codeobj.pickle
Binary file not shown.
Binary file modified docs/auto_examples/plot_mack_codeobj.pickle
Binary file not shown.
Binary file modified docs/auto_examples/plot_munich_codeobj.pickle
Binary file not shown.
Binary file modified docs/auto_examples/plot_triangle_from_pandas_codeobj.pickle
Binary file not shown.
Binary file modified docs/auto_examples/plot_triangle_slicing_codeobj.pickle
Binary file not shown.
37 changes: 35 additions & 2 deletions docs/modules/development.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,41 @@ Loss Development Patterns
Basic Development
==================

:class:`Development` allows for the selection of loss development patterns.

:class:`Development` allows for the selection of loss development patterns. Many
of the typical averaging techniques are available in this class, as well as the
ability to exclude certain patterns from the LDF calculation.

Single Development Adjustment vs Entire Triangle adjustment
-----------------------------------------------------------

Most of the arguments of the ``Development`` class can be specified for each
development period separately. When adjusting individual development periods,
a list is required that defines the argument for each development period.

**Example:**
>>> import chainladder as cl
>>> raa = cl.load_dataset('raa')
>>> cl.Development(average=['volume']+['simple']*8).fit(raa)

This approach works for ``average``, ``n_periods``, ``drop_high`` and ``drop_low``.

Omitting link ratios
--------------------
There are several arguments for dropping individual cells from the triangle as
well as excluding whole valuation periods or highs and lows. Any combination
of the 'drop' arguments is permissible.

**Example:**
>>> import chainladder as cl
>>> raa = cl.load_dataset('raa')
>>> cl.Development(drop_high=True, drop_low=True).fit(raa)
>>> cl.Development(drop_valuation='1985').fit(raa)
>>> cl.Development(drop=[('1985', 12), ('1987', 24)]).fit(raa)
>>> cl.Development(drop=('1985', 12), drop_valuation='1988').fit(raa)

.. note::
``drop_high`` and ``drop_low`` are ignored in cases where the number of link
ratios available for a given development period is less than 3.

.. _incremental:

Expand Down

0 comments on commit 7df5952

Please sign in to comment.