-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_prepare.py
105 lines (80 loc) · 4.6 KB
/
data_prepare.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None
from tqdm import tqdm
import os
import zipfile
from joblib import delayed, Parallel
from itertools import product
from utils import CHARAS_LIST
import warnings
warnings.filterwarnings('ignore')
def _fetch_auxiliary_files():
    """Download the four auxiliary files with wget, then move them into data/.

    All downloads are issued first and all moves afterwards, and the exit
    status of every shell command is ignored.
    """
    remote_files = (
        ('https://cloud.tsinghua.edu.cn/f/179082ecf0f147a4840c/?dl=1', 'portfolio_ret.pkl'),
        ('https://cloud.tsinghua.edu.cn/f/b93c6ae7e2014d3a951e/?dl=1', 'ff5.csv'),
        ('https://cloud.tsinghua.edu.cn/f/5f077be9eda0428ab7e5/?dl=1', 'UMD.csv'),
        ('https://cloud.tsinghua.edu.cn/f/a916da12d5a9450eb0df/?dl=1', 'p_charas.pkl'),
    )
    for url, filename in remote_files:
        os.system(f'wget {url} -O {filename}')
    for _, filename in remote_files:
        os.system(f'mv {filename} data')


# First-run bootstrap: create ./data and populate it with the auxiliary files.
# NOTE(review): the download/move commands are assumed to run only when the
# data/ directory is first created -- confirm this matches the intended layout.
if 'data' not in os.listdir():
    os.mkdir('data')
    _fetch_auxiliary_files()

# Load the monthly return table and cache it as data/ret.pkl.
# NOTE(review): read_pickle is applied to a path ending in .csv -- confirm the
# file really is a pickle; a plain-text CSV would need pd.read_csv instead.
f = 'data/ret.csv'
print('Reading ret.csv', end=' ')
mon_ret = pd.read_pickle(f)
mon_ret.to_pickle('data/ret.pkl')
print('Done!')

# Load the characteristics table, store the list of distinct DATE values and
# cache the table itself as data/datashare_to2021.pkl.
# NOTE(review): same .csv-path / read_pickle mismatch as above -- confirm.
f = 'data/datashare_to2021.csv'
print('Reading datashare_to2021.csv', end=' ')
datashare = pd.read_pickle(f)
_distinct_dates = datashare['DATE'].drop_duplicates().reset_index(drop=True)
_distinct_dates.to_pickle('data/mon_list.pkl')
datashare.to_pickle('data/datashare_to2021.pkl')
print('Done!')
def pre_process(date, data=None, charas=None):
    """Rank-normalise the characteristics of one cross-section.

    For the rows whose ``DATE`` equals ``date``:

    1. missing characteristic values are replaced by that characteristic's
       cross-sectional median (or by 0 when every value is missing);
    2. each characteristic is converted to a dense rank (equal values share
       a rank) and rescaled linearly to the interval [-1, 1];
    3. a characteristic that is constant across the cross-section is set to 0.

    Args:
        date: Value of the ``DATE`` column selecting the cross-section.
        data: Table holding ``permno``, ``DATE`` and the characteristic
            columns. Defaults to the module-level ``datashare``.
        charas: Names of the characteristic columns. Defaults to
            ``CHARAS_LIST``.

    Returns:
        A DataFrame indexed 0..n-1 with columns
        ``['permno', 'DATE'] + charas``; rows keep the order they have in
        ``data``.
    """
    source = datashare if data is None else data
    chara_cols = list(CHARAS_LIST if charas is None else charas)

    cross_slice = source.loc[source['DATE'] == date]
    factors = cross_slice[chara_cols]

    # Median fill; an all-NaN column has a NaN median, which falls back to 0.
    medians = factors.median().fillna(0)
    factors = factors.fillna(medians)

    # Dense ranking keeps every row in place. The previous implementation
    # ranked the de-duplicated values and joined them back with a right merge,
    # which returns rows grouped by unique value and therefore misaligned the
    # ranks with permno/DATE whenever a characteristic had repeated values.
    dense_rank = factors.rank(method='dense')
    lowest = dense_rank.min()
    span = dense_rank.max() - lowest

    # A constant column has span 0, so 0/0 yields NaN, which is mapped to 0.
    scaled = ((dense_rank - lowest) / span * 2 - 1).fillna(0)
    scaled = scaled.reset_index(drop=True)

    scaled.insert(0, 'DATE', cross_slice['DATE'].astype(int).to_numpy())
    scaled.insert(0, 'permno', cross_slice['permno'].astype(int).to_numpy())
    return scaled[['permno', 'DATE'] + chara_cols]
def cal_portfolio_ret(it, df, ret_df=None):
    """Return half the long-minus-short return of one characteristic-sorted portfolio.

    Stocks of month ``d`` are sorted by characteristic ``f`` in descending
    order; the first ``n // 10`` form the long leg and the last ``n // 10``
    the short leg, where ``n`` is the number of stocks in that month.

    Args:
        it: Pair ``(d, f)``: the ``DATE`` value and the characteristic column.
        df: Table with ``permno``, ``DATE`` and the column ``f``.
        ret_df: Table with ``permno``, ``date`` and ``return`` columns.
            Defaults to the module-level ``mon_ret``.

    Returns:
        ``0.5 * (mean long return - mean short return)``. Stocks without a
        return for the month are skipped; the result is NaN when either leg
        has no usable return (for example with fewer than 10 stocks).
    """
    d, f = it[0], it[1]
    returns = mon_ret if ret_df is None else ret_df

    ranked = df.loc[df['DATE'] == d, ['permno', f]].sort_values(by=f, ascending=False)

    # One size for both legs. The former slice `[-n//10:]` parsed as
    # `(-n)//10`, i.e. ceil(n/10) names, so the short leg could hold one more
    # stock than the long leg (and one stock when the long leg was empty).
    decile_size = len(ranked) // 10
    long_portfolio = ranked.head(decile_size)['permno'].to_list()
    short_portfolio = ranked.tail(decile_size)['permno'].to_list()

    # Build the permno -> return lookup for the month once for both legs.
    month_returns = (
        returns.loc[returns['date'] == d]
        .drop_duplicates('permno')
        .set_index('permno')['return']
    )
    long_ret = month_returns.reindex(long_portfolio).dropna().mean()
    short_ret = month_returns.reindex(short_portfolio).dropna().mean()

    return 0.5 * (long_ret - short_ret)
def cal_portfolio_charas(month, df, charas=None):
    """Compute the characteristics of every characteristic-sorted portfolio.

    For each characteristic, the stocks of ``month`` are sorted by it in
    descending order; the first ``n // 10`` form the long leg and the last
    ``n // 10`` the short leg. The portfolio's characteristics are half the
    difference between the legs' mean characteristics.

    Args:
        month: Value of the ``DATE`` column selecting the cross-section.
        df: Table with ``permno``, ``DATE`` and the characteristic columns.
        charas: Names of the characteristic columns. Defaults to
            ``CHARAS_LIST``.

    Returns:
        A DataFrame with one row per sorting characteristic (index
        ``'p_' + name``) and columns ``['DATE'] + charas``. Values are NaN
        when the month has fewer than 10 stocks.
    """
    chara_cols = list(CHARAS_LIST if charas is None else charas)

    # The month filter does not depend on the sorting column; apply it once.
    month_slice = df.loc[df['DATE'] == month]

    # One size for both legs. The former slice `[-n//10:]` parsed as
    # `(-n)//10`, i.e. ceil(n/10) names, making the short leg larger.
    decile_size = len(month_slice) // 10

    rows = []
    for sort_col in chara_cols:
        ranked = month_slice.sort_values(by=sort_col, ascending=False)
        # Rows are taken directly instead of being looked up again by permno;
        # the two are equivalent when permno is unique within the month.
        long_mean = ranked.head(decile_size)[chara_cols].mean()
        short_mean = ranked.tail(decile_size)[chara_cols].mean()
        rows.append([month] + (0.5 * (long_mean - short_mean)).to_list())

    row_labels = ['p_' + name for name in chara_cols]
    return pd.DataFrame(rows, index=row_labels, columns=['DATE'] + chara_cols)
if __name__ == '__main__':
    # Run pre_process once per distinct DATE value, spread over all available
    # cores by joblib, with a tqdm progress bar over the dates.
    distinct_dates = datashare.DATE.drop_duplicates().to_list()
    progress = tqdm(distinct_dates, colour='green', desc='Processing')
    jobs = (delayed(pre_process)(d) for d in progress)
    monthly_frames = Parallel(n_jobs=-1)(jobs)

    # Stack the per-date results and persist them as a single pickle.
    processed_df = pd.concat(monthly_frames)
    processed_df.to_pickle('data/datashare_re.pkl')