Commit

Temp changes

jmbejara committed Jul 25, 2024
1 parent f26d331 commit f148532
Showing 8 changed files with 265 additions and 0 deletions.
Binary file added week_4/__pycache__/config.cpython-39.pyc
Binary file added week_4/__pycache__/my_plotting_module.cpython-39.pyc
61 changes: 61 additions & 0 deletions week_4/config.py
@@ -0,0 +1,61 @@
"""Provides easy access to paths and credentials used in the project.
Meant to be used as an imported module.
Example
-------
import config
path = config.output_dir
path
## The config YAML should look something like this:
# config.yml
default:
  data_dir: "C:/My Documents/data/misc_project"
  private_data_dir: "D:/My Documents/private_data/misc_project"
  output_dir: "C:/Users/jdoe/GitRepositories/misc_project/output"
  wrds_username: "jdoe"
AWS:
  data_dir: "/data/awshomes/jdoe/data/misc_project"
  private_data_dir: "/data/awshomes/jdoe/private_data/misc_project"
  output_dir: "/data/awshomes/jdoe/GitRepositories/INT_misc_project/output"
"""
import yaml
from pathlib import Path

# config.yml is expected to sit one directory above this module.
with open("../config.yml") as f:
    config = yaml.safe_load(f)

def _read_config_entry(upper_key, lower_key):
    """Return the config entry as a Path, or None if the entry is missing."""
    entry = config[upper_key][lower_key]
    if entry is None:
        p = None
    else:
        p = Path(entry)
    return p

def switch_to(pathset_name='default'):
    """Point the module-level path variables at the named pathset."""
    global data_dir
    global private_data_dir
    global output_dir
    global pathset

    data_dir = _read_config_entry(pathset_name, "data_dir")
    private_data_dir = _read_config_entry(pathset_name, "private_data_dir")
    output_dir = _read_config_entry(pathset_name, "output_dir")
    pathset = pathset_name

def read(key):
    """Read a raw (unconverted) value from the active pathset."""
    upper_key = pathset
    value = config[upper_key][key]
    return value

# Initialize the module-level paths on import.
switch_to(pathset_name='default')

if __name__ == "__main__":
    pass
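    # Example usage (a minimal sketch; assumes an 'AWS' pathset exists in config.yml):
    #   import config
    #   config.switch_to('AWS')
    #   config.output_dir            # now the AWS output directory
    #   config.switch_to('default')  # switch back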


20 changes: 20 additions & 0 deletions week_4/example_plot.py
@@ -0,0 +1,20 @@
from matplotlib import pyplot as plt

import config
import my_plotting_module

# config.read() returns the raw value from config.yml; config.output_dir is
# the same entry already converted to a Path.
config.read('output_dir')

# With only a few sample points, the sine curve looks jagged.
my_plotting_module.plot_sine_function(N=5)
my_plotting_module.plot_sine_function(N=15)

# Start from a clean figure, then save a smooth version.
plt.clf()
my_plotting_module.plot_sine_function(N=1000)
plt.savefig(config.output_dir / 'mysine.png')
9 changes: 9 additions & 0 deletions week_4/my_plotting_module.py
@@ -0,0 +1,9 @@
import numpy as np
from matplotlib import pyplot as plt


def plot_sine_function(N=1000):
    """Plot sin(x) on [-10, 10] using N evenly spaced sample points."""
    x = np.linspace(-10, 10, N)
    y = np.sin(x)
    plt.plot(x, y)
Binary file added week_4/mysine.png
Binary file added week_4/pca.png
175 changes: 175 additions & 0 deletions week_4/pca_example.py
@@ -0,0 +1,175 @@
# %% [markdown]
# # Factor Analysis and Principal Component Analysis on Financial and Economic Time Series

# %%
# If you're running this on Colab, make sure to install the following packages using pip.
# On your own computer, I recommend using conda or mamba.

# !pip install pandas-datareader
# !pip install yfinance

# !conda install pandas-datareader
# !conda install yfinance

# %%
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from pathlib import Path  # used by stacked_plot below

import yfinance as yf
import pandas_datareader as pdr
import sklearn.decomposition
import statsmodels.multivariate.pca

start_date = pd.to_datetime('1980-01-01')
end_date = pd.to_datetime('today')

# %%
fred_series_short_names = {
    'BAMLH0A0HYM2': 'High Yield Index OAS',
    'NASDAQCOM': 'NASDAQ',
    'RIFSPPFAAD90NB': '90-Day AA Fin CP',
    'DTB3': '3-Month T-Bill',
    'DGS10': '10-Year Treasury',
    'VIXCLS': 'VIX',
}
df = pdr.get_data_fred(list(fred_series_short_names.keys()), start=start_date, end=end_date)
df = df.rename(columns=fred_series_short_names)

# %%
df

# %%
# Note: this displays the result; df itself is not modified here.
df.dropna()

# %% [markdown]
# ## Transforming and Normalizing the data
#
# What do transformation and normalization mean? Are they different things?
#
# - Why would one transform data? What is feature engineering?
# - What is normalization?
#
# What does stationarity mean? See the following plots. Some of these variables are stationary; others are not. Why is this a problem?
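
# %%
# A quick look at the raw series (a minimal sketch): a level series like the
# NASDAQ index trends upward and is clearly non-stationary, while spreads such
# as the High Yield OAS fluctuate around a long-run level.
df.plot(subplots=True, layout=(3, 2), figsize=(10, 8), sharex=True)
plt.tight_layout()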


# %% [markdown]
# Let's try some transformations like those used in the OFR Financial Stress Index: https://www.financialresearch.gov/financial-stress-index/files/indicators/index.html

# %%
# Start with an empty frame shaped like df to hold the transformed series.
dfn = pd.DataFrame().reindex_like(df)
dfn

# %%
# The rolling mean computed on the raw (gappy) data...
df['NASDAQ'].rolling(250).mean()

# %%
df = df.dropna()

# %%
# ...and again after dropping missing rows, so each 250-observation window
# spans roughly one year of trading days.
df['NASDAQ'].rolling(250).mean()

# %%
# 'High Yield Index OAS': leave as is
dfn['High Yield Index OAS'] = df['High Yield Index OAS']
# 3-month commercial paper rate minus the 3-month T-Bill rate
dfn['CP - Treasury Spread, 3m'] = df['90-Day AA Fin CP'] - df['3-Month T-Bill']
# 'NASDAQ': we're using something different from OFR, but still apply a rolling-mean transformation
dfn['NASDAQ'] = df['NASDAQ'] - df['NASDAQ'].rolling(250).mean()
dfn['10-Year Treasury'] = df['10-Year Treasury'] - df['10-Year Treasury'].rolling(250).mean()
# 'VIX': leave as is
dfn['VIX'] = df['VIX']

# %%
dfn = dfn.drop(columns=['90-Day AA Fin CP', '3-Month T-Bill'])
dfn = dfn.dropna()

# %% [markdown]
# We're finished with our transformations. Now let's normalize. First, why is normalization important?
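
# %%
# The transformed series still live on very different scales, so an
# unnormalized PCA would be dominated by whichever column has the largest
# variance. Compare the standard deviations:
dfn.std()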

# %% [markdown]
# Now, normalize each column,
# $$
# z = \frac{x - \bar x}{\text{std}(x)}
# $$

# %%
dfn = (dfn - dfn.mean()) / dfn.std()
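
# %%
# Sanity check: every column should now have mean ~0 and standard deviation ~1.
dfn.describe().loc[['mean', 'std']]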

# %%
def pca(dfn, module='scikitlearn'):
    """Return the standardized first principal component and its loadings."""
    if module == 'statsmodels':
        _pc1, _loadings, projection, rsquare, _, _, _ = statsmodels.multivariate.pca.pca(
            dfn, ncomp=1, standardize=True, demean=True, normalize=True,
            gls=False, weights=None, method='svd')
        _loadings = _loadings['comp_0']
        loadings = np.std(_pc1) * _loadings
        pc1 = _pc1 / np.std(_pc1)
        pc1 = pc1.rename(columns={'comp_0': 'PC1'})['PC1']

    elif module == 'scikitlearn':
        sk_pca = sklearn.decomposition.PCA(n_components=1)
        _pc1 = pd.Series(sk_pca.fit_transform(dfn)[:, 0], index=dfn.index, name='PC1')
        _loadings = sk_pca.components_.T * np.sqrt(sk_pca.explained_variance_)
        _loadings = pd.Series(_loadings[:, 0], index=dfn.columns)

        loadings = np.std(_pc1) * _loadings
        pc1 = _pc1 / np.std(_pc1)
        pc1.name = 'PC1'
    else:
        raise ValueError("module must be 'statsmodels' or 'scikitlearn'")

    loadings.name = "loadings"

    return pc1, loadings

def stacked_plot(df, filename=None):
    """Stacked area plot of per-series contributions, with their sum overlaid.

    Positive and negative contributions are stacked separately; the black
    line is the row-wise sum (e.g., the first principal component).
    """
    df_pos = df[df >= 0]
    df_neg = df[df < 0]

    alpha = .3
    linewidth = .5

    ax = df_pos.plot.area(alpha=alpha, linewidth=linewidth, legend=False)
    pc1 = df.sum(axis=1)
    pc1.name = 'pc1'
    pc1.plot(color="Black", label='pc1', linewidth=1)

    plt.legend()
    # Restart the color cycle so negative areas match their positive counterparts.
    ax.set_prop_cycle(None)
    df_neg.plot.area(alpha=alpha, ax=ax, linewidth=linewidth, legend=False, ylim=(-3, 3))
    # recompute the ax.dataLim
    ax.relim()
    # update ax.viewLim using the new dataLim
    ax.autoscale()
    # ax.set_ylabel('Standard Deviations')
    # ax.set_ylim(-3, 4)
    # ax.set_ylim(-30, 30)

    if filename is not None:
        filename = Path(filename)
        figure = plt.gcf()  # get current figure
        figure.set_size_inches(8, 6)
        plt.savefig(filename, dpi=300)


# %%
pc1, loadings = pca(dfn, module='scikitlearn')
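
# %%
# The two backends should agree up to an overall sign flip, since the sign of
# a principal component is arbitrary. A quick check (assumes the statsmodels
# branch above runs in your environment):
pc1_sm, loadings_sm = pca(dfn, module='statsmodels')
print(pc1.corr(pc1_sm))  # expect a value near +1 or -1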

# %%
plt.clf()
pc1.plot()
plt.savefig('pca.png')
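
# %%
# Inspect the loadings: the sign and relative magnitude show how each series
# contributes to the first principal component.
plt.clf()
loadings.plot.barh()
plt.tight_layout()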


# # %%
# stacked_plot(dfn)
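
# %%
# A hedged sketch of how stacked_plot() might be used: weight each normalized
# series by its loading so the stacked areas show which series drive the index
# at each date; the row sums are proportional to PC1. (An illustration, not
# the OFR methodology; the output filename is hypothetical.)
contributions = dfn.mul(loadings, axis=1)
contributions = contributions * (pc1.std() / contributions.sum(axis=1).std())
plt.clf()
stacked_plot(contributions, filename='stress_index_contributions.png')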

