Skip to content

Commit

Permalink
updated nan implementation
Browse files Browse the repository at this point in the history
  • Loading branch information
kasperjo committed Mar 25, 2024
1 parent 2fc73fe commit 3b158ae
Show file tree
Hide file tree
Showing 4 changed files with 1,638 additions and 9,177 deletions.
98 changes: 36 additions & 62 deletions cvx/covariance/ewma.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from __future__ import annotations

import warnings
from collections import namedtuple

import numpy as np
Expand All @@ -14,7 +13,7 @@ def _generator2frame(generator):
return pd.DataFrame(dict(generator)).transpose()


def _variance(returns, halflife, min_periods=0, clip_at=None, nan_to_num=False):
def _variance(returns, halflife, min_periods=0, clip_at=None):
"""
Estimate variance from returns using an exponentially weighted moving average (EWMA)
Expand All @@ -28,7 +27,6 @@ def _variance(returns, halflife, min_periods=0, clip_at=None, nan_to_num=False):
halflife=halflife,
min_periods=min_periods,
clip_at=clip_at,
nan_to_num=nan_to_num,
)


Expand All @@ -53,7 +51,7 @@ def volatility(returns, halflife, min_periods=0, clip_at=None):
)


def center(returns, halflife, min_periods=0, mean_adj=False, nan_to_num=False):
def center(returns, halflife, min_periods=0, mean_adj=False):
"""
Center returns by subtracting their exponentially weighted moving average (EWMA)
Expand All @@ -70,7 +68,6 @@ def center(returns, halflife, min_periods=0, mean_adj=False, nan_to_num=False):
data=returns,
halflife=halflife,
min_periods=min_periods,
nan_to_num=nan_to_num,
)
)
return (returns - mean).dropna(how="all", axis=0), mean
Expand Down Expand Up @@ -101,8 +98,25 @@ def iterated_ewma(
mu_halflife1=None,
mu_halflife2=None,
clip_at=None,
nan_to_num="default",
nan_to_num=True,
):
"""
param returns: pandas dataframe with returns for each asset
param vola_halflife: half life for volatility
param cov_halflife: half life for covariance
param min_preiods_vola: minimum number of periods to use for volatility
ewma estimate
param min_periods_cov: minimum number of periods to use for covariance
ewma estimate
param mean: whether to estimate mean; if False, assumes zero mean data
param mu_halflife1: half life for the first mean adjustment; if None, it is
set to vola_halflife; only applies if mean=True
param mu_halflife2: half life for the second mean adjustment; if None, it is
set to cov_halflife; only applies if mean=True
param clip_at: winsorizes ewma update at +-clip_at*(current ewma) in ewma;
if None, no winsorization is performed
nan_to_num: if True, replace NaNs in returns with 0.0
"""
mu_halflife1 = mu_halflife1 or vola_halflife
mu_halflife2 = mu_halflife2 or cov_halflife

Expand All @@ -112,7 +126,13 @@ def scale_cov(vola, matrix):
matrix = matrix.values

# Convert (covariance) matrix to correlation matrix
v = 1 / np.sqrt(np.diagonal(matrix)).reshape(-1, 1)
# v = 1 / np.sqrt(np.diagonal(matrix)).reshape(-1, 1)
diag = np.diagonal(matrix).copy()
one_inds = np.where(diag == 0)[0]
if one_inds.size > 0:
diag[one_inds] = 1 # temporarily, to avoid division by zero
v = 1 / np.sqrt(diag).reshape(-1, 1)
v[one_inds] = np.nan
matrix = v * matrix * v.T

cov = vola.reshape(-1, 1) * matrix * vola.reshape(1, -1)
Expand All @@ -122,33 +142,31 @@ def scale_cov(vola, matrix):
def scale_mean(vola, vec1, vec2):
return vec1 + vola * vec2

if nan_to_num:
if returns.isna().any().any():
returns = returns.fillna(0.0)

# compute the moving mean of the returns

# TODO: Check if this is correct half life
returns, returns_mean = center(
returns=returns, halflife=mu_halflife1, min_periods=0, mean_adj=mean
)

# estimate the volatility, clip some returns before they enter the estimation
# estimate the volatility, clip some returns before they enter the
# estimation
vola = volatility(
returns=returns,
halflife=vola_halflife,
min_periods=min_periods_vola,
clip_at=clip_at,
)

# wherever vola is not zero: adj = clip((returns / vola), clip_at=clip_at)
# wherever vola is zero: adj = NaN

# adj = returns / vola

# adj the returns
# make sure adj is NaN where vola is zero or returns are
# NaN. This handles NaNs (which in some sense is the same as a constant
# return series, i.e., vola = 0)
adj = clip((returns / vola), clip_at=clip_at) # if vola is zero, adj is NaN
# print((adj.isna()).sum())
# adj[vola == 0] = np.nan
adj = adj.fillna(0.0)

# center the adj returns again
adj, adj_mean = center(adj, halflife=mu_halflife2, min_periods=0, mean_adj=mean)
Expand All @@ -157,7 +175,6 @@ def scale_mean(vola, vec1, vec2):
data=adj,
halflife=cov_halflife,
min_periods=min_periods_cov,
nan_to_num=nan_to_num,
):
if mean:
m = scale_mean(
Expand All @@ -173,7 +190,7 @@ def scale_mean(vola, vec1, vec2):
)


def _ewma_cov(data, halflife, min_periods=0, nan_to_num=True):
def _ewma_cov(data, halflife, min_periods=0):
"""
param data: Txn pandas DataFrame of returns
param halflife: float, halflife of the EWMA
Expand All @@ -185,20 +202,18 @@ def _ewma_cov(data, halflife, min_periods=0, nan_to_num=True):
halflife=halflife,
fct=lambda x: np.outer(x, x),
min_periods=min_periods,
nan_to_num=nan_to_num,
):
if not np.isnan(ewma).all():
yield t, pd.DataFrame(index=data.columns, columns=data.columns, data=ewma)


def _ewma_mean(data, halflife, min_periods=0, clip_at=None, nan_to_num=False):
def _ewma_mean(data, halflife, min_periods=0, clip_at=None):
for t, ewma in _general(
data.values,
times=data.index,
halflife=halflife,
min_periods=min_periods,
clip_at=clip_at,
nan_to_num=nan_to_num,
):
# converting back into a series costs some performance but adds robustness
if not np.isnan(ewma).all():
Expand All @@ -213,7 +228,6 @@ def _general(
fct=lambda x: x,
min_periods=0,
clip_at=None,
nan_to_num="default",
):
"""
y: frame with measurements for times t=t_1,t_2,...,T
Expand All @@ -222,8 +236,6 @@ def _general(
fct: function to apply to y
min_periods: minimum number of observations to start EWMA
clip_at: clip y_last at +- clip_at*EWMA (optional)
nan_to_num: if True, replace NaNs in y with 0.0; "default" means True if
y[i] is 2D and False otherwise
returns: a generator over (t_i, EWMA of fct(y_i))
"""
Expand All @@ -234,13 +246,6 @@ def f(k):

return times[k], _ewma

if nan_to_num == "default":
if y[0].ndim == 2:
nan_to_num = True
y = np.nan_to_num(y) # ensures covariances are PSD
else:
nan_to_num = False

if halflife:
alpha = 1 - np.exp(-np.log(2) / halflife)

Expand All @@ -256,16 +261,6 @@ def f(k):

next_val = fct(row)

# ### Merge NaNs from next_val and _ewma
nans = False
mask_next_val = np.isnan(next_val)
mask_ewma = np.isnan(_ewma)

if (mask_next_val != mask_ewma).any():
nans = True
next_val[mask_next_val] = _ewma[mask_next_val]
_ewma[mask_ewma] = next_val[mask_ewma]

# update the moving average
if clip_at and n >= min_periods + 1:
_ewma = _ewma + (1 - beta) * (
Expand All @@ -276,25 +271,4 @@ def f(k):
1 - np.power(beta, n + 1)
)

# if nans and _ewma.ndim == 2: # project _ewma onto PSD cone
# nan_entries = np.isnan(_ewma).all(axis=0)
# _ewma_clean = _ewma[~nan_entries][:, ~nan_entries]

# print(pd.DataFrame(_ewma_clean))

# eigvals, eigvecs = np.linalg.eigh(_ewma_clean)
# eigvals = np.maximum(eigvals, 0)
# _ewma_clean = eigvecs @ np.diag(eigvals) @ eigvecs.T

# # Update _ewma with the cleaned data
# non_nan_indices = np.where(~nan_entries)[0] # Get indices of non-NaN entries
# _ewma[np.ix_(non_nan_indices, non_nan_indices)] = _ewma_clean
if nans:
warnings.warn(
f"A new asset entered at time {times[n]} (or the first time "
f"the return of this asset changed). This may result in "
f"non-PSD covariance matrices. Consider using nan_to_num=True.",
Warning,
)

yield f(k=n)
10,676 changes: 1,581 additions & 9,095 deletions experiments/extensions/playground.ipynb

Large diffs are not rendered by default.

Loading

0 comments on commit 3b158ae

Please sign in to comment.