-
Notifications
You must be signed in to change notification settings - Fork 43
/
Copy pathmain.Rmd
948 lines (715 loc) · 54 KB
/
main.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
---
title: "Regime Switching and Technical Trading with Dynamic Bayesian Networks in High-Frequency Stock Markets"
author: "Luis Damiano, Brian Peterson, Michael Weylandt"
date: "`r Sys.Date()`"
output:
pdf_document:
toc: true
number_sections: true
citation_package: none
keep_tex: true
includes:
in_header: ../common/Rmd/preamble-latex.Rmd
bibliography: ../common/references.bib
---
```{r setup, include = FALSE}
# Global chunk defaults: hide code and output, cache results, pin figure
# placement with the LaTeX 'H' specifier, and suppress warnings in the PDF.
knitr::opts_chunk$set(
  echo    = FALSE,
  cache   = TRUE,
  include = FALSE,
  fig.pos = 'H',
  warning = FALSE
)
# Ask knitr to carry base-graphics par() settings across chunks.
knitr::opts_knit$set(global.par = TRUE)
```
```{r}
library(xtable)

# Reset xtable and base-graphics options from inside a chunk.
# global.par = TRUE alone won't propagate these reliably, so every
# table/plot chunk calls setglobals() first. What a beautiful hack!
setglobals <- function() {
  options(xtable.sanitize.text.function = identity,
          xtable.sanitize.rownames.function = identity,
          xtable.sanitize.colnames.function = identity,
          xtable.sanitize.subheadings.function = identity,
          xtable.sanitize.message.function = identity,
          xtable.comment = FALSE,
          xtable.booktabs = TRUE,
          xtable.floating = TRUE,
          xtable.latex.environments = "center",
          xtable.tabular.environment = "tabularx")
  # NOTE: cex.names was removed from this call — it is a barplot() argument,
  # not a graphical parameter, so par() only warned and ignored it.
  par(cex.axis = 0.70, cex.lab = 0.70, cex.main = 0.70)
}

# Print an integer with thousands separators (e.g. 12345 -> "12,345").
pint <- function(x) { print(formatC(x, format = "d", big.mark = ",")) }

# Print a float with thousands separators and "." as the decimal mark.
pflo <- function(x) { print(formatC(x, format = "f", big.mark = ",", decimal.mark = ".")) }

# ptab <- function(x) { sapply(x, function(e) {sprintf("%0.2f", e)}) }

# Render a matrix as a booktabs table, optionally rounding to `digits`
# and prepending a `row.names` column.
pmat <- function(mat, row.names = NULL, digits = NULL, ...) {
  if (!is.null(digits)) {mat <- round(mat, digits)}
  if (!is.null(row.names)) {mat <- cbind(row.names, mat)}
  knitr::kable(mat, booktabs = TRUE, row.names = FALSE, ...)
}
```
```{r}
library(digest)
library(highfrequency)
library(moments)
library(rstan)
library(shinystan)
library(xts)
source('../common/R/plots.R')
source('R/constants.R')
source('R/feature-extraction.R')
source('R/trading-rules.R')
source('R/state-plots.R')

# Set up! -----------------------------------------------------------------
# Data kept in a private folder as we do not have redistribution rights
data.files <- c('data/G.TO/2007.05.04.G.TO.RData',
                'data/G.TO/2007.05.07.G.TO.RData',
                'data/G.TO/2007.05.08.G.TO.RData',
                'data/G.TO/2007.05.09.G.TO.RData',
                'data/G.TO/2007.05.10.G.TO.RData',
                'data/G.TO/2007.05.11.G.TO.RData')

# Timespans to separate training and test sets
ins <- '2007-05-04 09:30:00/2007-05-10 16:30:00'
oos <- '2007-05-11 09:30:00/2007-05-11 16:30:00'

# Alpha threshold in the change-in-volume setting (0.25 = Tayal 2009)
features.alpha <- 0.25

# HHMM structure: K production/emission states, L possible outcomes
K <- 4
L <- 9

# MCMC settings
n.iter   <- 500
n.warmup <- 250
n.chains <- 1
n.cores  <- 1
n.thin   <- 1
n.seed   <- 9000

# Directory for a naive cache of Stan fit objects:
# the sampler won't run twice under exactly the same setup.
# NULL = No cache!
cache.path <- 'fore_cache'
```
```{r results = "asis"}
# HTML-only styling tweak, disabled for the PDF build; kept for reference.
# cat("<style>img{border: 0px !important;} tbody{font-size:9px;}</style>")
```
This work aims at replicating the Hierarchical Hidden Markov Model (HHMM) originally proposed by @tayal2009regime to learn price and volume patterns using rules from technical analysis, infer the hidden state of the system and identify runs and reversals out-of-sample in a statistically significant way. The main goal is to reproduce the results of the original research as well as to provide additional insight and criticism. Also, we produce public programming code in [Stan](http://mc-stan.org/) [@carpenter2016stan] for a fully Bayesian estimation of the model parameters and inference on hidden quantities. A brief introduction about Hidden Markov Models (HMM) can be found in our [literature review](https://github.com/luisdamiano/gsoc17-hhmm/blob/master/litreview/main.pdf).
The authors acknowledge Google for financial support via the Google Summer of Code 2017 program.
---
# Motivation
The use of Markov-switching models to represent bull and bear markets is not novel. @chauvet2000coincident define bulls and bears according to the general increasing or decreasing trend in prices and uses an unobservable two-state Markov variable to represent investors' real-time belief about the state of financial conditions. @maheu2000identifying succeed in combining return mean and variance to identify all major market downturns in over 160 years of monthly price data. @lunde2004duration, who define market states based on sequences of stopping times tracing local peaks and troughs in prices, find duration dependence in stock prices. The longer a bull market has lasted, the lower is the probability that it will come to a termination. In contrast, the longer a bear market has lasted, the higher is its termination probability. Interest rates affect cumulated changes in stock prices as well. The existence of market states is not only supported by statistical models. @gordon2000preference establishes a link between time-varying prices of risk under the Capital Asset Pricing Model framework and state-dependent preferences.
# Hierarchical Hidden Markov Models
The HHMM is a recursive hierarchical generalization of the HMM that provides a systematic unsupervised approach for complex multi-scale structure. The model is motivated by the multiplicity of length scales and the different stochastic levels (recursive nature) present in some sequences. Additionally, it infers correlated observations over long periods via higher levels of hierarchy.
The model structure is fairly general and allows an arbitrary number of activations of its submodels. The multi-resolution structure is handled by temporal experts^[In Machine Learning terminology, a problem is divided into homogeneous regions addressed by an expert submodel. A gating network or function decides which expert to use for each input region.] of different time scales.
## Model specification
HHMMs are structured multi-level stochastic processes that generalize the HMM by making each of the hidden states an autonomous probabilistic model. There are two kinds of states: internal states are HHMMs that emit sequences by a recursive activation of one of their substates, while production states generate an output symbol according to the probability distribution over the set of output symbols.
Hidden dynamics are led by transitions. Vertical transitions involve the activation of a substate by an internal state; they may include further vertical transitions to lower-level states. Once completed, they return the control to the state that originated the recursive activation chain. Then, a horizontal transition is performed: a state transition within the same level.
A HHMM can be represented as a standard single level HMM whose states are the production states of the corresponding HHMM with a fully connected structure, i.e. there is a non-zero probability of transition from any state to any other state. This equivalent new model lacks the multi-level structure.
Let $z_{t}^{d} = i$ be the state of an HHMM at the step $t$, where $i \in \{1, \dots, |z^{d}|\}$ is the state index, $|z^{d}|$ is the number of possible states within the $d$-th level and $d \in \{1, \dots, D\}$ is the hierarchy index taking values $d = 1$ for the root state, $d \in \{2, \dots, D-1\}$ for the remaining internal states and $d = D$ for the production states.
In addition to its structure, the model is characterized by the state transition probability between the internal states and the output distribution of the production states. For each internal state $z_t^d$ for $d \in \{1, \dots, D - 1\}$, there is a state transition probability matrix $\mat{A}^d$ with elements $a_{ij}^{d} = p(z_{t+1}^{d+1} = j | z_{t}^{d+1} = i)$ being the probability of a horizontal transition from the $i$-th state to the $j$-th state within the level $d$. Similarly, there is the initial distribution vector over the substates $\mat{\pi}^d$ with elements $\pi_j^d = p(z_t^{d+1} = j | z_t^d)$ for $d \in \{1, \dots, D - 1\}$. Finally, each production state $z_t^D$ is parametrized by the output parameter vector $\mat{\theta}_o^j$ whose form depends on the specification of the observation model $p(\mat{x}_t | z_t^D = j, \mat{\theta}_o^j)$ corresponding to the $j$-th production state.
## Generative model
The root node initiates a stochastic sequence generation. An observation for the first step in the sequence $t$ is generated by drawing at random one of the possible substates according to the initial state distribution $\mat{\pi}^1$. To replicate the recursive activation process, for each internal state entered $z_t^d$ one of the substates is randomly chosen according to the corresponding initial probability vector $\mat{\pi}^d$. When an internal state transitions to a production state $z_t^D = j$, a single observation is generated according to the state output parameter vector $\mat{\theta}_o^j$. Control returns to the internal state that led to the current production state $z_t^{D-1}$, which in turn selects the next state in the same level according to the transition matrix $\mat{A}^{D-1}$.
Save for the top, each level $d \in \{2, \dots, D\}$ has a final state that terminates the stochastic state activation process and returns the control to the parent state of the whole hierarchy. The generation of the observation sequence is completed when control of all the recursive activations returns to the root state.
## Parameter estimation
The parameters of the models are $\mat{\theta} = \left\{ \left\{ \mat{A}^d \right\}_{d \in \{1, \dots, D - 1\}}, \left\{ \mat{\pi}^d \right\}_{d \in \{1, \dots, D - 1\}}, \left\{ \mat{\theta}_o \right\} \right\}$. The form of $\mat{\theta}_o$ depends on the specification of the production states. We refer the reader to @fine1998hierarchical for a detailed treatment of estimation and inference procedures.
---
# Regime Switching and Technical Trading with Dynamic Bayesian Networks in High-Frequency Stock Markets
## Preamble
Many published works argue for the existence of systematic patterns in price action. @brock1992simple test technical trading rules and strategies. @karpoff1987relation, later followed by @gallant1992stock, make four contributions supporting that volume is a significant source of information for price changes. @lo2000foundations argue that several technical indicators provide incremental information and may have some practical value. Statistical significance does not imply profitability, however, and part of the research towards technical analysis is subject to various problems in their testing procedure [@park2007what].
@tayal2009regime is the first to propose a graphical model for technical analysis. The author creates data features based on technical analysis rules and designs an HHMM to learn intraday price and volume patterns. Probabilistic inference allows the identification of two distinct states in high-frequency data that are mainly marked by buying and selling pressure.
## Feature extraction
### Input series
Tick series are a sequence of triples $\{y_k\}$ with $y_k = (t_k, p_k, v_k)$, where $t_k \le t_{k+1}$ is the time stamp in seconds, $p_k$ is the trade price and $v_k$ is the trade volume. The sequence is ordered by the occurrence of trades. There can be more than one trade within a second.
Following @tayal2009regime, who in turn drew inspiration from the technical analysis techniques proposed by @ord2008secret, we derive a zig-zag sequence that captures the bid-ask bounce $\{z_n\}$ with $z_n = (i_n, j_n, e_n, \phi_n)$, where $i_n \le j_n$ are indices to the tick series representing the starting and ending point of the extrema, $e_n = p_k \ \forall \ k : i_n \le k \le j_n$ is the price at the local extrema, and $\phi_n$ measures the average volume per second during the zig-zag leg ending at $e_n$:
\[
\phi_n = \frac{1}{t_{j_n} - t_{i_{n-1}} + 1} \sum_{k = i_{n-1}}^{j_n}{v_k}.
\]
We note that $p_{i_n - 1} < e_n > p_{j_n + 1}$ for local maxima and $p_{i_n - 1} > e_n < p_{j_n + 1}$ for local minima. The average volume, which includes the end-point extrema, is normalized by $t_{j_{n}} - t_{i_{n-1}} + 1$ to avoid division by zero when the zig-zag leg occurs within the same time period. Most importantly, we underline that the $n$-th zig-zag point $z_n$ is realized only after observing the $(j_n + 1)$-th tick point $y_{j_n + 1}$. Failing to consider the one-tick lag between leg completion and the time of detection would cause look-ahead bias in the out-of-sample forecasts.
### Processing rules
Discrete features are created based on the zig-zag series $\{z_n\}$. We first create an auxiliary series $\{O_n\}$ with $O_n = (f_n^0, f_n^1, f_n^2)$, where $f_n^0$ represents the direction of the zig-zag, $f_n^1$ indicates the existence of a trend and $f_n^2$ indicates whether average volume increased or decreased.
Formally,
\[
f_n^0 =
\begin{cases}
+1 & \text{if $e_n$ is a local maximum (positive zig-zag leg)} \\
-1 & \text{if $e_n$ is a local minimum (negative zig-zag leg),} \\
\end{cases}
\]
and
\[
f_n^1 =
\begin{cases}
+1 & \text{if $e_{n-4} < e_{n-2} < e_{n} \wedge e_{n-3} < e_{n-1}$ (up-trend)} \\
-1 & \text{if $e_{n-4} > e_{n-2} > e_{n} \wedge e_{n-3} > e_{n-1}$ (down-trend)} \\
0 & \text{otherwise (no trend).}
\end{cases}
\]
For the third indicator function, we compute the average volume ratios,
\[
\nu_n^1 = \frac{\phi_n}{\phi_{n-1}}, \quad \nu_n^2 = \frac{\phi_n}{\phi_{n-2}}, \quad \nu_n^3 = \frac{\phi_{n-1}}{\phi_{n-2}},
\]
we transform the ratios into a discrete variable using an arbitrary threshold $\alpha$,
\[
\tilde{\nu}_n^j =
\begin{cases}
+1 & \text{if $\nu_n^j - 1 > \alpha$} \\
-1 & \text{if $1 - \nu_n^j > \alpha$} \\
0 & \text{if $|\nu_n^j - 1| \le \alpha$}, \\
\end{cases}
\]
and we finally define
\[
f_n^2 =
\begin{cases}
+1 & \text{if $\tilde{\nu}_n^1 = 1, \tilde{\nu}_n^2 > -1, \tilde{\nu}_n^3 < 1$ (volume strengthens)} \\
-1 & \text{if $\tilde{\nu}_n^1 = -1, \tilde{\nu}_n^2 < 1, \tilde{\nu}_n^3 > -1$ (volume weakens)} \\
0 & \text{otherwise (volume is indeterminate)}. \\
\end{cases}
\]
The features or legs $\mat{D} = \{D_1, \dots, D_9 \}$, $\mat{U} = \{U_1, \dots, U_9 \}$ are then created using Table $\ref{tab:feature-space}$.
```{r, include = TRUE, results="asis"}
# Build the feature-space table (tab:feature-space): the 18 market-state legs
# U1-U9 and D1-D9 as combinations of zig-zag direction, price trend and
# change in volume.
df.col <- c("Zig-zag direction", "Price trend", "Change in volume", "Market State")
df.row <- expand.grid(1:9, c("U", "D"))
# Short codes used below mapped to their display labels, one list per column.
df.lab <- list(
  list("up" = "Up +1", "dn" = "Down -1"),
  list("up" = "Up +1", "nt" = "No trend 0", "dn" = "Down -1"),
  list("st" = "Strong +1", "in" = "Indeterminate 0", "wk" = "Weak -1"),
  list("bu" = "Bull", "lv" = "Local volatility", "be" = "Bear")
)
df <- data.frame(
  direction = rep(c('up', 'dn'), each = 9),
  trend = rep(c('up', 'dn', 'up', 'nt', 'nt', 'nt', 'dn', 'up', 'dn'), 2),
  cgvol = c('st', 'st', 'in', 'st', 'in', 'wk', 'in', 'wk', 'wk', 'wk', 'wk', 'in', 'wk', 'in', 'st', 'in', 'st', 'st'),
  state = rep(c(rep('bu', 4), 'lv', rep('be', 4)), 2),
  stringsAsFactors = FALSE
)
colnames(df) <- df.col
rownames(df) <- sprintf("$%s_{%s}$", df.row[, 2], df.row[, 1])
# Replace each short code with its display label, column by column.
# seq_along() is safe for zero-length lists, unlike 1:length().
for (i in seq_along(df.lab)) {
  for (j in seq_along(df.lab[[i]])) {
    df[, i] <- gsub(paste0("\\<", names(df.lab[[i]])[j], "\\>"), df.lab[[i]][[j]], df[, i])
  }
}
setglobals()
# One sub-table per zig-zag direction, printed as a single LaTeX float.
tabs <- xtableList(split(df, df[, 1]),
                   caption = "Feature space.",
                   label = "tab:feature-space",
                   align = c('l', rep('R', 4)))
print(tabs,
      colnames.format = "single", tabular.environment = "tabularx",
      width = "0.8 \\textwidth", size = "\\footnotesize")
```
Defining appropriate rules that capture trade volume dynamics and identify trends in volume despite high-frequency noise is the most challenging aspect of this design. @wisebourt2011hierarchical proposes a modification of the feature extraction procedure. By computing the spread between the Volume Weighted Average Prices of the bid and the ask, he devises a book imbalance metric that describes the state of the order book at any given moment in time. In turn, @sandoval2015computational applies wavelets over two simple-smoothed exponential distance-weighted average volume series to measure trade volume concentration in both sides of the book.
## Model
\label{sec:model}
We adhere to the methodology proposed in the original work as much as possible. We set up a HHMM to learn the sequence of discrete features extracted from a high-frequency time series of stock prices and traded volume. The figure below summarizes the model structure in the form of a Dynamic Bayesian Network.
The graph starts with a root node $z^0$ that has two top-level children $z_1^1$ and $z_2^1$ representing bullish markets (or runs) and bearish markets (or reversals). The specification does not pose any constraints that determine beforehand which node takes each of the two possible meanings. In consequence, latent states need to be labeled after the learning stage based on sample characteristics such as mean returns. Although the original author does not mention this possibility, prior information, like parameter ordering, could be embedded to break symmetry and mitigate potential identification issues.
\definecolor{myred}{RGB}{228, 31, 38}
\begin{figure}[!h]
\centering
\begin{tikzpicture}[
roundnode/.style={circle, draw=myred, very thick, minimum size=7mm},
groundnode/.style={circle, draw=myred, fill=myred!10, very thick, minimum size=7mm},
droundnode/.style={circle, double, draw=myred, very thick, minimum size=7mm},
]
%Nodes
\node[roundnode] (root) {$z^0$};
\node[roundnode] (top1) [below left = 1.0cm and 1.5cm of root] {$z^1_1$};
\node[roundnode] (top2) [below right = 1.0cm and 1.5cm of root] {$z^1_2$};
\node[groundnode, label={\small -}] (bot1) [below left = 1cm of top1] {$z^2_1$};
\node[groundnode, label={\small +}] (bot2) [right = 0.4cm of bot1] {$z^2_2$};
\node[droundnode] (bot51) [below right = 1cm of top1] {$z^2_5$};
\node[groundnode, label={\small +}] (bot3) [below left = 1cm of top2] {$z^2_3$};
\node[groundnode, label={\small -}] (bot4) [right = 0.4cm of bot3] {$z^2_4$};
\node[droundnode] (bot52) [below right = 1cm of top2] {$z^2_5$};
%Lines
\draw[->, dashed] (root) -- (top1);
\draw[->, dashed] (root) -- (top2);
\draw[->] (top1) edge [bend right = 05] (top2);
\draw[->] (top2) edge [bend right = 05] (top1);
\draw[->, dashed] (top1) -- (bot1);
\draw[->, dashed] (bot51) -- (top1);
\draw[->] (bot1) edge [bend right = 10] (bot2);
\draw[->] (bot2) edge [bend right = 10] (bot1);
\draw[->, dashed] (top2) -- (bot3);
\draw[->, dashed] (bot52) -- (top2);
\draw[->] (bot3) edge [bend right = 10] (bot4);
\draw[->] (bot4) edge [bend right = 10] (bot3);
\draw[->] (bot1) edge [bend right = 60] (bot51);
\draw[->] (bot3) edge [bend right = 60] (bot52);
\end{tikzpicture}
\caption{Hierarchical Hidden Markov Model for price and volume.}
\end{figure}
<!-- Top level states transition vertically to an HMM that represents observations from bull or bear market states. Bottom level nodes in gray are emission states that produce one negative or positive zig-zag before transitioning horizontally to another node inside the HMM. Bottom level double-circled nodes are terminations that return control back to the top level, which is forced to transition horizontally to the alternate top state. -->
Each top node activates a probabilistic HMM with two latent states for negative and positive zig-zag legs. The sub-model activated by the node $z_1^1$ always starts with node $z_1^2$ producing an observation from the distribution of negative zig-zag $\mat{D} = \{D_1, \dots, D_9 \}$. Next, it transitions to (a) node $z_2^2$ producing a positive zig-zag leg $\mat{U} = \{U_1, \dots, U_9 \}$, or (b) the end node and switches to the second sub-model, landing on node $z_3^2$ and producing a positive zig-zag leg. Restricting transitions to this limited set of movements forces alternation between positive and negative zig-zag legs, thus guaranteeing that all possible observation sequences are well behaved. The sub-model belonging to $z_2^1$ has similar but symmetrical behaviour. These two inner models are conditionally independent, an advantage we will exploit to reduce computational time.
The persistence of price trends varies according to the resolution of the dataset. In low frequency contexts, financial analysts will frequently define short and long term trends based on timespans involving weeks and months of observations. In high frequency trading, however, trends may last from seconds to hours. The main idea behind the conception of the hierarchical model is that top levels may group a sequence of price movements forming a general trend, whereas the bottom nodes allow zig-zags to represent micro trends that may deviate shortly from the main drift. That is, the model allows for possibly unequally probable negative price movements in an upwards market as well as positive short trends during a price reversal.
Theoretically, all HHMM can be expressed as an equivalent HMM with possibly sparse initial probability vector and transition matrix. Although learning from this representation may prove less efficient in terms of computational complexity, the relatively simple structure and many restrictions of the model under study make estimation and inference feasible. The equivalent "expanded" HMM has $K = 4$ states, the following $K$-sized initial probability vector
\[
\mat{\pi} =
\begin{bmatrix}
\pi_1 & 0 & 1 - \pi_1 & 0 \\
\end{bmatrix}
\]
and the $K \times K$ transition matrix
\[
\mat{A} =
\begin{bmatrix}
0 & a_{12} & 1 - a_{12} & 0 \\
1 & 0 & 0 & 0 \\
a_{31} & 0 & 0 & 1 - a_{31} \\
0 & 0 & 1 & 0 \\
\end{bmatrix},
\]
where each element $a_{ij}$ represents the probability of transitioning from the hidden state $i$ (row) to $j$ (column) in one step. The matrix is sparse with zeros representing nodes with no direct connections. Since the initial probability vector and the rows of the transition matrix must sum to one, hidden dynamics are governed by only three free parameters.
Production nodes emit one observation from the finite sets of possible outputs $\mat{O}_D$ (for $z_1^2$ and $z_4^2$) and $\mat{O}_U$ (for $z_2^2$ and $z_3^2$). Conditional probability distributions are given by output probability vectors of length $L = 9$ subject to sum-to-one constraints:
\[
\begin{aligned}
\mat{B}^1 &= p(\mat{x}_t | z_1^2) = \left[b^{1}_{D_1}, \dots, b^{1}_{D_9} \right], \\
\mat{B}^2 &= p(\mat{x}_t | z_2^2) = \left[b^{2}_{U_1}, \dots, b^{2}_{U_9} \right], \\
\mat{B}^3 &= p(\mat{x}_t | z_3^2) = \left[b^{3}_{U_1}, \dots, b^{3}_{U_9} \right], \\
\mat{B}^4 &= p(\mat{x}_t | z_4^2) = \left[b^{4}_{D_1}, \dots, b^{4}_{D_9} \right]. \\
\end{aligned}
\]
The observation model has 32 free parameters.
On the whole, the parameter vector for the HMM representation reduces to vector of size $35$,
\[
\mat{\theta} = (\pi_1, a_{12}, a_{31}, b^1_{D_l}, b^2_{U_l}, b^3_{U_l}, b^4_{D_l}),
\]
with $l \in \{1, \dots, L-1\}$.
## Dataset
The original work presents results for both simulated and real data. The latter is based on historical high-frequency time series for the 60 stocks listed in the S&P/TSE60 index. The dataset consists of all 22 business days of May 2007. The author excludes three days due to significant errors without disclosing the exact dates. We confirm that our results are consistent with the original work by focusing on GoldCorp Inc (TSE:G), the only series described exhaustively in the original work.
Note that we do not model prices directly. Instead, non-linear transformations are applied to the trade price and volume series to produce the sequence of features that feeds the proposed model.
## Methodology
\label{sec:methodology}
Model parameters are estimated on a rolling window with five days each. Since top nodes are symmetrical, states are labeled ex-post based on the order of the in-sample mean of the percentage change in the initial and final price before the top level state switch. The states with larger and smaller returns are marked as bullish (a run) and bearish (a reversal), respectively.
<!-- The conditional distributions of the features are studied to confirm that they differ from the unconditional distribution, i.e. the price process can be split into two regimes. Additionally, the differences are examined to confirm that they differ in the expected way, i.e. the bullish state is more likely to have observations with price increases supported by volume increases, and price decreases are not supported by volume decreases. -->
After learning and labeling, the author runs two out of sample inference procedures on the sixth day. First, offline smoothing infers the hidden state at time $t$ based on the full evidence of the sixth day. Although this quantity is not useful for trading because of its look-ahead bias, it provides an upper bound benchmark for the model. Second, online filtering is used as a trading rule. Although smoothing is a valid benchmark, we focus exclusively on the latter.
Most of the diagnostics are based on trade returns. For the $l$-th top-level state switch, the percentage return is defined as
\[
R_l = \frac{p^e_l - p^s_l}{p^s_l},
\]
where $p^s_l$ and $p^e_l$ are the price at the start and end of the switch.
The information content of learning the top-level state is assessed by comparing the unconditional empirical distribution of trade returns versus their empirical conditional distribution given the top state. Additionally, regime return characteristics are validated: mean trade returns are expected to be higher for bullish regimes compared to bearish regimes, and they are expected to be positive and negative for runs and reversals respectively. In the original work, most of these analysis are run both in-sample and out-of-sample. We focus on out-of-sample results only.
Finally, a trading strategy is tested. After a zig-zag leg is completed, the trading system buys one unit every time the top-level state switches to bullish and sells one unit every time it switches to bearish. As an addition to the original research, where trades are executed one tick after the leg is observed to ensure that there is no look-ahead bias, we investigate the decay of the strategy performance for longer lags.
## GoldCorp Inc (TSE:G)
```{r}
# Data loading ------------------------------------------------------------
# Load every daily RData file into a scratch environment, tag each series
# with its symbol, force the Toronto time zone, and stack the days by row.
data.env <- new.env()
series <- do.call(rbind, lapply(data.files, function(path) {
  obj.name <- load(file = path, envir = data.env)
  daily <- get(obj.name, data.env)
  attr(daily, 'symbol') <- obj.name
  indexTZ(daily) <- 'America/Toronto'
  return(daily)
}))
rm(data.env)
# Keep the first two columns (price and size), dropping incomplete rows.
tdata <- na.omit(series[, 1:2])
colnames(tdata) <- c("PRICE", "SIZE")
```
We present an in-depth study of one stock to assess the strengths and weaknesses of the model. Using the tick-by-tick series of GoldCorp Inc (TSE:G), we split our dataset in training (2007-05-04 to 2007-05-10 - five trading days) and test (2007-05-11) sets. Next, we run our procedure in a walking forward fashion for this stock as well as others and present some summary statistics for the performance of the strategy.
### Data exploration
```{r}
# Feature extraction ------------------------------------------------------
# Build the zig-zag feature series from the tick data using the project
# helper from R/feature-extraction.R; system.time() reports the run time.
system.time(zig <- extract_features(tdata, features.alpha))
# Split the features into in-sample and out-of-sample windows
# via xts time-range subsetting.
zig.ins <- zig[ins]
zig.oos <- zig[oos]
```
We center our attention on the sequence of trades, disregarding possibly valuable information from the bid and ask series. Future research may employ such information to improve model predictability. We start by extracting the features using the procedure detailed above. We set the threshold for the change-in-volume indicator variable at $\alpha = 0.25$ as suggested by the author of the original work. The in-sample dataset reduces to `r sprintf("%0.0f", nrow(zig.ins))` zig-zags.
Apart from the zig-zags themselves, local extrema are interesting on their own. Although we could not gain insight by visually inspecting other intermediate indicators such as the trend $f_1$, further research may find value in them.
```{r, include = TRUE, out.width = '\\textwidth', fig.cap = "Local extrema detected in TSE:G 2007-05-11 10:30:00/2007-05-11 11:30:00. \\label{tseg-ins-extrema}"}
# One-hour window used for the feature plots.
ss <- '2007-05-11 10:30:00/2007-05-11 11:30:00'
setglobals()
# Plot only the detected local extrema over the raw tick series.
plot_features(tdata[ss, ], zig[ss, ], which.features = 'extrema')
```
```{r, include = TRUE, out.width = '\\textwidth', fig.cap = "Features extracted from TSE:G 2007-05-11 10:30:00/2007-05-11 11:30:00. \\label{tseg-ins-features}"}
setglobals()
# Plot every extracted feature for the same one-hour window `ss`.
plot_features(tdata[ss, ], zig[ss, ], which.features = 'all')
```
### Estimation
```{r}
# Sizes and feature sequences for the training and test windows
T.ins <- nrow(zig.ins)
x.ins <- as.vector(zig.ins$feature)
T.oos <- nrow(zig.oos)
x.oos <- as.vector(zig.oos$feature)
rstan_options(auto_write = TRUE)
options(mc.cores = parallel::detectCores())
stan.model <- 'stan/hhmm-tayal2009-lite.stan'
# Features 1..L are negative legs and L+1..2L positive legs: encode each
# observation as a sign indicator (1 = negative, 2 = positive) plus a
# within-sign feature index in 1..L
stan.data <- list(
T = T.ins,
K = K,
L = L,
sign = ifelse(x.ins < L + 1, 1, 2),
x = ifelse(x.ins < L + 1, x.ins, x.ins - L),
T_oos = T.oos,
sign_oos = ifelse(x.oos < L + 1, 1, 2),
x_oos = ifelse(x.oos < L + 1, x.oos, x.oos - L))
# A naive implementation for a stan cache: hash everything that can change
# the fit (data, model source, sampler settings) and reuse a saved fit
# when the digest matches
cache.objects <- list(series, features.alpha,
stan.model, readLines(stan.model),
stan.data, n.iter, n.warmup,
n.thin, n.chains, n.cores, n.seed)
cache.digest <- digest(cache.objects)
cache.filename <- file.path(cache.path, paste0(cache.digest, ".RDS"))
# Scalar, short-circuiting && so file.exists() is never reached when
# cache.path is NULL; the elementwise & evaluated file.exists(character(0)),
# which returns logical(0) and makes if() error out
if (!is.null(cache.path) && file.exists(cache.filename)) {
stan.fit <- readRDS(cache.filename)
} else {
stan.fit <- stan(file = stan.model,
model_name = stan.model,
data = stan.data, verbose = TRUE,
iter = n.iter, warmup = n.warmup,
thin = n.thin, chains = n.chains,
cores = n.cores, seed = n.seed)
if (!is.null(cache.path))
saveRDS(stan.fit, cache.filename)
}
rm(cache.objects, cache.digest, cache.filename)
# Total number of post-warmup draws across all chains
n.samples <- (n.iter - n.warmup) * n.chains
```
Model parameters can be segregated into two groups: parameters for the hidden model, including the initial distribution vector as well as the transition matrix, and parameters for the conditional multinomial distributions for the output. Table \ref{tab:tseg-ins-transition} summarizes the former. The estimates for the initial distribution probabilities $\pi_1$ and $1 - \pi_1$ are uncertain to a large extent, which is unsurprising as the sample provides only one starting observation and the model imposes almost no prior information for the parameters. @team2017stan, a technical manual for a programming language that also contains many brief discussions of statistical notions, describes some alternative specifications. If the sample were conceived as a subsequence of a long-running data generating process, the initial probabilities may be set to equal the stationary state probabilities of the transition Markov chain. Contrarily, if the sample was considered a finite-length sequence, the model may have a different starting distribution.
On the other hand, estimation of the transition parameters profits from a larger number of trades: more information reduces uncertainty. We note that the bull top state seems persistent, as positive zig-zags $z^2_3$ are more likely to transition to bull negative zig-zags $z^2_4$ than to bear negative zig-zags $z^2_1$ by a factor of $a_{34} / a_{31} \approx 10$. After studying several stocks and timespans, we conclude that these empirical observations are specific to the sample and are highly influenced by the general price trend present in the five-day training dataset.
```{r, include = TRUE, results="asis"}
# Posterior summary of the initial distribution (p_1k) and transition
# matrix (A_ij): mean, sd and the 10/50/90% quantiles
mat <- summary(stan.fit, pars = c('p_1k', 'A_ij'),
probs = c(0.10, 0.50, 0.90))$summary[, c(1, 3, 4, 5, 6)]
# Drop entries that are structurally zero in the constrained transition matrix
mat <- mat[mat[, 5] != 0, ]
colnames(mat) <- c('Mean', 'Std. Deviation', '$q_{10\\%}$', '$q_{50\\%}$', '$q_{90\\%}$')
rownames(mat) <- c('$\\pi_{1}$', '$1 - \\pi_{1}$',
'$a_{12}$', '$1 - a_{12}$',
'$a_{21}$',
'$a_{31}$', '$1 - a_{31}$',
'$a_{43}$')
setglobals()
tab <- xtable(mat,
caption = "Estimated parameters of the transition matrix
for TSE:G 2007-05-04 09:30:00/2007-05-10 16:30:00.",
label = "tab:tseg-ins-transition",
align = c('r', rep('R', 5)))
print(tab, digits = 4,
width = "0.65 \\textwidth", size = "\\footnotesize")
```
Output distributions are summarized in Figure \ref{fig:tseg-ins-multinomial}. It is important to note that zig-zags with no price trend $D_4, D_5, D_6, U_4, U_5, U_6$ are the most prominent features. In particular, it is worthwhile to analyse the behaviour of the model in the presence of local volatility $D_5, U_5$, i.e. when there is no clear trend in price and volume.
```{r, include = TRUE, out.width = '\\textwidth', fig.height = 2.5, fig.cap = "Estimated parameters of the conditional multinomial distribution of the outputs given the emission state (bottom node) for TSE:G 2007-05-04 09:30:00/2007-05-10 16:30:00. \\label{fig:tseg-ins-multinomial}"}
# Horizontal interval plot: one row of `mat` per line, drawn top to bottom.
# Columns 1/5 give the thin full-range whisker, columns 2/4 the thick gray
# inner bar, and column 3 the central dot.
parplot <- function(mat, y.axislab = NULL, xlim = NULL, ...) {
  if (ncol(mat) != 5)
    stop("Please provide a mat with five columns: 1 and 5 for the long line, 2 and 4 for the short line and 3 for the central dot.")
  # Default x-range: overall data range rounded to whole numbers
  if (is.null(xlim))
    xlim <- round(c(min(mat), max(mat)), 0)
  # Vertical positions, first row on top
  row.pos <- seq(nrow(mat), 1)
  # Empty canvas; the y axis is drawn manually below so that labels can be
  # supplied or suppressed by the caller
  plot(NULL,
       xlim = xlim, ylim = round(c(min(row.pos), max(row.pos)), 0),
       yaxt = 'n',
       ...)
  if (!is.null(y.axislab))
    axis(2, at = row.pos, labels = y.axislab, las = 1)
  # Thin black whisker with end caps (columns 1 and 5)
  arrows(mat[, 1], row.pos, mat[, 5], row.pos,
         angle = 90, code = 3,
         col = 'black', lwd = 1, length = 0.04)
  # Thick gray bar, no caps (columns 2 and 4)
  arrows(mat[, 2], row.pos, mat[, 4], row.pos,
         angle = 90, code = 0,
         col = 'gray', lwd = 3, length = 0.04)
  # Central dot (column 3)
  points(x = mat[, 3], y = row.pos, cex = par()$cex * 0.65,
         pch = 21, bg = 'black', col = 'black')
}
# Posterior draws of the emission (multinomial) parameters phi_k
phi_k <- extract(stan.fit, pars = 'phi_k')[[1]]
# For each (state, outcome) cell: min, 10/50/90% quantiles and max
mat <- apply(phi_k, c(2, 3), function(p) {c(min(p), quantile(p, c(0.10, 0.50, 0.90)), max(p))})
subs <- c('Bear - Negative zig-zag', 'Bear - Positive zig-zag', 'Bull - Positive zig-zag', 'Bull - Negative zig-zag')
setglobals()
# One panel per production state, side by side
par(mfrow = c(1, K))
layout(matrix(1:K, TRUE), widths = c(0.22, rep(0.22, 3)))
for (k in 1:K) {
par(mar = c(3.1, 3.1, 4.1, 0))
y.axislab <- paste("Phi", 1:9)
# Only the first panel carries y-axis labels; later panels share its axis
if (k != 1) {
par(mar = c(3.1, 0, 4.1, 0))
y.axislab <- NULL
}
# par(oma = c(0, 1, 0, 0))
parplot(t(mat[, k, ]),
main = bquote(atop('Production state' ~ .(k),
.(subs[k]))),
y.axislab = y.axislab,
xlim = c(0, 1), xlab = "", ylab = "")
}
```
We first study short-term price changes in the opposite direction to the long-term trend. The vast majority of negative zig-zags observed in bullish markets are due to local volatility ($\hat{\phi}_{45}$ = 0.88) as do most of the positive zig-zags found in bearish markets ($\hat{\phi}_{25}$ = 0.80). We note, however, that negative legs with no price trends but weakening or strengthening trade volume are equally probable in bearish markets ($\hat{\phi}_{14} \approx \hat{\phi}_{16}$). Analogously, we find in bullish markets that $\hat{\phi}_{34} \approx \hat{\phi}_{36}$. These observations counter the a priori classification stated in Table \ref{tab:feature-space}.
Moreover, we find that positive zig-zags originated in local volatility are more likely to be seen in bearish markets versus bullish markets by a factor of $\phi_{25} / \phi_{35} \approx 4$. The odds for negative zig-zags are similar. We warn the reader that the current model needs more information to classify this feature. All in all, we are warned that zig-zag direction $f^0$ and change in volume $f^2$ are not decisive without a price trend, a hint that the current model needs either better feature engineering rules for the change in volume or more external information to deal with local volatility.
### Convergence
Figure \ref{tseg-ins-diags} illustrates the trace plot of some arbitrary parameters as well as some diagnostic measures. In general terms, mixing and convergence to the stationary distribution are acceptable. The shrink factor of @gelman1992inference, close to $1$ for all hidden quantities, suggests an adequate degree of convergence. Sampling is efficient, as signaled by effective sample size ratios near $1$ and Monte Carlo Standard Error to posterior standard deviation ratios well below 10%.
```{r include = TRUE, out.width = '\\textwidth', fig.height = 4, fig.cap = "Traceplot of some arbitrarily selected parameters and histograms of diagnostic measures. Mixing, convergence to the stationary distribution and sampling efficiency are acceptable.\\label{tseg-ins-diags}"}
# Trace-plot helper: draw the chain as a light gray line and overlay a
# darker horizontal line at the sample median.
quickplot <- function(x, ...) {
  plot(x, type = 'l', col = 'lightgray', xlab = "Iteration", bty = 'l', ...)
  abline(h = median(x), col = 'gray')
}
# Histogram helper using Freedman-Diaconis bin widths and muted colors.
quickhist <- function(x, ...) {
  hist(x, breaks = "FD", col = 'lightgray', border = 'gray', ...)
}
setglobals()
par(mfrow = c(2, 3))
# Top row: trace plots of three arbitrarily chosen parameters
quickplot(extract(stan.fit, pars = 'p_11')[[1]], main = expression(pi[1]), ylab = "")
quickplot(extract(stan.fit, pars = 'A_ij')[[1]][, 1, 2], main = expression(a[12]), ylab = "")
quickplot(extract(stan.fit, pars = 'phi_k')[[1]][, 1, 5], main = expression(phi[1*","*5]), ylab = "")
# Bottom row: convergence/efficiency diagnostics across all quantities
quickhist(summary(stan.fit)[[1]][, 'Rhat'], main = expression(hat(R) ~ "statistic"), xlab = "")
quickhist(summary(stan.fit)[[1]][, 'n_eff'] / n.samples, main = expression("Effective sample size (%)"), xlab = "")
quickhist(summary(stan.fit)[[1]][, 'se_mean'] / summary(stan.fit)[[1]][, 'sd'], main = expression("Monte Carlo SE / SD"), xlab = "")
```
Although restating the original Hierarchical Hidden Markov Model as a Hidden Markov Model may increase time and memory complexity, the new model becomes significantly easier to program and to bring to convergence. A full list of our results, including convergence statistics such as $\hat{R}$ and the effective sample size, is included in the Appendix Section \ref{sec:appendix-tseg-parameters}.
### State probability
```{r, include = FALSE}
# Extraction
# Posterior draws of the filtered state probabilities (in- and out-of-
# sample) and of the jointly most likely path (Viterbi)
alpha_tk.ins <- extract(stan.fit, pars = 'alpha_tk')[[1]]
alpha_tk.oos <- extract(stan.fit, pars = 'alpha_tk_oos')[[1]]
zstar_t.oos <- extract(stan.fit, pars = 'zstar_t')[[1]]
# Hard classification: assign each observation to the state with the
# largest posterior-median filtered probability
state.filtered.ins <- apply(apply(alpha_tk.ins, c(2, 3), median), 1, which.max)
state.filtered.oos <- apply(apply(alpha_tk.oos, c(2, 3), median), 1, which.max)
state.filtered <- c(state.filtered.ins, state.filtered.oos)
state.viterbi.oos <- apply(zstar_t.oos, 2, median)
# Top state classification ------------------------------------------------
topstate.labels <- c("Bear", "Bull")
topstate.pairs <- list(c(1, 2), c(3, 4)) # bottom-node to top-node map
topstate.index <- c(state.bear, state.bull)
# assign zig-zag to top states; seq_along() instead of 1:length() guards
# the degenerate empty-list case
zig$topstate <- state.filtered
for (i in seq_along(topstate.pairs))
zig$topstate[state.filtered %in% topstate.pairs[[i]]] <- topstate.index[i]
zig$topstate.chg <- zig$topstate != lag(zig$topstate)
zig$topstate.chg[1] <- TRUE  # the first observation always opens a run
# build top sequence: one row per run of consecutive identical top states
top <- zig[zig$topstate.chg == TRUE, ]
top$end <- c(as.numeric(tail(top$start, -1)) - 1, last(zig$end))
top$len <- top$end - top$start
top$ret <- (as.numeric(tdata[top$end, 1]) - as.numeric(tdata[top$start, 1])) / as.numeric(tdata[top$start, 1])
print(sprintf("%% BEFORE AN EVENTUAL SWITCH:
bottom nodes 1, 2 (bear) have a mean return of %0.4f%%,
while bottom nodes 3, 4 (bull) have a mean return of %0.4f%%.",
100 * mean(top$ret[top$topstate == state.bear]),
100 * mean(top$ret[top$topstate == state.bull])))
# label top nodes: the top states have no a priori order, so make sure the
# "bull" index is the one with the larger in-sample mean return, swapping
# the labels when necessary
if (mean(top$ret[top$topstate == state.bear]) > mean(top$ret[top$topstate == state.bull])) {
top$topstate <- ifelse(top$topstate == state.bear, state.bull, state.bear)
zig$topstate <- ifelse(zig$topstate == state.bear, state.bull, state.bear)
print(sprintf("%%I identified top-nodes as bears and bulls.
Result: Bear = %0.4f%% vs Bull = %0.4f%%",
mean(top$ret[top$topstate == state.bear]),
mean(top$ret[top$topstate == state.bull])))
}
# Attach the feature and top-state columns back onto the tick series and
# split everything into in-sample / out-of-sample windows
tdata <- xts_expand(tdata, zig[, c('feature', 'topstate')])
tdata.ins <- tdata[ins]
tdata.oos <- tdata[oos]
zig.ins <- zig[ins]
zig.oos <- zig[oos]
top.ins <- top[ins]
top.oos <- top[oos]
```
The forward algorithm allows us to calculate the filtered belief state: the probability that an observation at time $t$ was emitted by one of the possible four states (bottom-nodes) given the evidence available up to $t$. We assign each observation to the emission state with largest filtered probability and, by the definition of the hierarchical model, they become naturally linked to one of the two possible top states (bears and bulls). Figure \ref{tseg-ins-features-hist} reflects the resulting classification.
```{r include = TRUE, out.width = '\\textwidth', fig.height = 2.5, fig.cap = "Distribution of features conditional on the estimated hidden regime (top node). \\label{tseg-ins-features-hist}"}
setglobals()
# Feature distribution conditional on the inferred top state
# (L = 9 features per zig-zag direction)
plot_topstate_features(zig.ins$feature, zig.ins$topstate, L = 9,
topstate.label = c("bear", "bull"))
```
The following table provides some summary statistics for the returns of the observations classified in each state. The structure of the top nodes is symmetrical and they do not have an a priori order. We label them according to the in-sample mean trade returns.
```{r, include = TRUE, results = "asis"}
# Summary statistics of per-trade returns, one row per top state plus an
# unconditional row
mat <- t(round(topstate_summary(top.ins), 4))
colnames(mat) <- c('Mean', 'SD', 'Skewness', 'Kurtosis',
'$q_{25}\\%$', '$q_{50}\\%$', '$q_{75}\\%$',
'Mean length', 'Median length')
rownames(mat) <- c('Bear', 'Bull', 'Unconditional')
setglobals()
tab <- xtable(mat,
caption = "Summary statistics for the return of the trades
assigned to each of the two possible top states for
TSE:G 2007-05-04 09:30:00/2007-05-10 16:30:00. Trade returns
are computed as defined in Section \\ref{sec:methodology}.
Trade length is computed as the number of ticks involved.
Returns expressed in percentage. SD means Standard Deviation.",
label = "tab:tseg-ins-topstate",
align = c('r', rep('R', 9)))
print(tab, digits = 4,
width = "\\textwidth", size = "\\footnotesize")
```
Bull top states have a greater mean return than bear top states by construction. As is also visible in Figure \ref{fig:tseg-ins-ret-hist}, a positive skewness coefficient for all states indicates that negative returns tend to outweigh positive returns for this specific stock during the five-day in-sample dataset. However, bear markets have a marked skew towards negative returns and become more risky in terms of extreme events (higher kurtosis).
```{r include = TRUE, out.width = '\\textwidth', fig.height = 2.5, fig.cap = "Conditional distribution of trade returns given the estimated top state in TSE:G 2007-05-04 09:30:00/2007-05-10 16:30:00. \\label{fig:tseg-ins-ret-hist}"}
setglobals()
par(mar = c(3.5, 4.1, 1.0, 2.1))
# Distribution of trade returns conditional on the inferred top state
plot_topstate_hist(top.ins$ret, top.ins$topstate,
main.lab = "Return", x.lab = "",
topstate.label = c("bear", "bull"))
```
We remark that trade returns originated in different stocks or timespans do not necessarily share these characteristics. Location, dispersion, symmetry and shape of the return distribution vary along stocks and day of analysis.
### Fitted output
As mentioned in Section \ref{sec:model}, the model assumes that outputs are emitted by one of four possible bottom nodes. By definition, negative legs belong to either $z_{1}^2$ or $z_{4}^2$ while positive legs belong to $z_{2}^2$ or $z_{3}^2$. Once the parameters are estimated, states $z_{1}^2$ and $z_{2}^2$ are labeled as bears while $z_{3}^2$ and $z_{4}^2$ are marked as bulls according to the mean trade return of the observations belonging to each top node.
Bullish states allow for negative zig-zags and bearish states allow for positive zig-zags as long as the trade volume is indeterminate or weak. The results of the classification are summarized in Table \ref{tab:tseg-filtered-ins}. As expected, the bullish top-node captures positive movements due to local volatility as well as downward movements with weak volume.
We remark that all observations belonging to each of the 18 possible features are imputed to one state. For example, observations $U_3$ are all mapped into state $z^2_4$. In a sense, hard classification suffers from information loss. This requires further research and may be seen as a weakness of the inference procedure. Nonetheless, in-sample classification results do not look alarming in Figure \ref{fig:tseg-ins-seqv}.
```{r, include = TRUE, results = "asis"}
mat <- t(rbind(
c(rep("Bear", 2), rep("Bull", 2)),
1:4,
round(as.matrix(table(x.ins, state.filtered.ins)), 0)))
colnames(mat) <- c("Top", "Bottom",
apply(expand.grid(1:9, c("U", "D")), 1,
function(x) { sprintf("$%s_{%s}$", x[2], x[1])}))
setglobals()
tab <- xtable(mat,
caption = "Features extracted are classified as emitted by one
of the four possible bottom-nodes according to the filtered
probability. TSE:G 2007-05-04 09:30:00/2007-05-10 16:30:00.",
label = "tab:tseg-filtered-ins",
align = c(rep('r', 2), rep('R', 19)))
print(tab, digits = 4, include.rownames = FALSE,
width = "\\textwidth", size = "\\scriptsize")
```
Now, let us look at how the features are classified; the result can be compared with the corresponding figure in the original work by Tayal.
```{r include = TRUE, out.width = '\\textwidth', fig.cap = "Tick by tick sequence of trades, classified as belonging to the bear or the bull top state, from TSE:G 2007-05-10 09:30:00/2007-05-10 10:30:00. \\label{fig:tseg-ins-seqv}"}
source('R/state-plots.R')
setglobals()
# One-hour in-sample window for the classified trade-sequence plot
ss <- '2007-05-10 09:30:00/2007-05-10 10:30:00'
# NOTE(review): the title reads the 'symbol' attribute from tdata.oos even
# though the plot shows in-sample data -- presumably both windows carry the
# same symbol; confirm the attribute survives xts subsetting.
plot_topstate_seqv(tdata.ins[ss, ], zig.ins[ss, ],
main.lab = sprintf("%s In-sample [%s]", attr(tdata.oos, 'symbol'), ss))
```
### Trading Strategy
Using the parameters estimated from the five days of training data, we run the trading strategy out of sample during one day. Figure \ref{fig:tseg-ins-seqv} depicts the equity line for 2007-05-10 while Table \ref{tab:appendix-wf-tseg} summarises the returns for a trading month. The HHMM strategy can produce a positive result in a day with a downward price trend. Positive returns can be achieved in the presence of execution lag, even though trading results are highly sensitive. In many cases, the strategy does not only contribute to the generation of profits but also to risk reduction.
\blandscape
\newpage
```{r include = TRUE, warning = FALSE, out.width='1\\linewidth', fig.width=14, fig.height=8, fig.cap = "Out of sample equity line for TSE:G 2007-05-10 09:30:00/2007-05-10 10:30:00. \\label{fig:tseg-ins-seqv}"}
setglobals()
# Out-of-sample trading with 1 tick of execution lag, plus a list of
# strategies lagged by 1..5 ticks for comparison in the equity-line plot
trades.oos <- topstate_trading(tdata.oos, 1)
ltrades.oos <- lapply(1:5, function(i) {topstate_trading(tdata.oos, i)})
plot_topstate_trading(tdata.oos, zig.oos, ltrades.oos)
```
\elandscape
```{r}
# Precomputed walk-forward trading results for all stocks and days
trades.all <- readRDS('fore_cache/tr_all.RDS')
```
```{r, include = TRUE, results = "asis"}
# Restrict the walk-forward results to the GoldCorp (TSE:G) series
filter.in <- c('G.TO')
# vapply guarantees a logical vector even for empty or degenerate input,
# unlike sapply whose return type depends on the input
task.filter <- vapply(trades.all, function(trade) {
any(grepl(paste(filter.in, collapse = '|'), trade$symbol))
}, logical(1))
trades <- trades.all[task.filter]
# One row per out-of-sample day: buy & hold compound return followed by
# the HHMM strategy at each execution lag
mat <- t(sapply(trades, function(trade) {
c(buyandhold = prod(1 + trade$trades$buyandhold) - 1,
sapply(trade$trades[-1], function(strat) {
prod(1 + strat$ret) - 1
}))
}))
colnames(mat) <- c("Buy-Hold", sprintf("HHMM (%s lag)", 0:5))
rownames(mat) <- sapply(trades, function(trade) { substr(trade$oos, 1, 10) })
setglobals()
# Column-wise summary statistics appended below the daily returns
mat.ext <- apply(mat, 2, function(m) {
c(total = prod(1 + m) - 1,
min = min(m),
mean = mean(m),
med = median(m),
max = max(m),
sd = sd(m))})
rownames(mat.ext) <- c("Total", "Min", "Mean", "Median", "Max", "SD")
tab <- xtable(round(100 * rbind(mat, mat.ext), 4),
caption = "Compound daily return originated in the HHMM trading strategy for different levels of lags. Lag measured in ticks between the end of the zig-zag and the execution of the trade (zero lag suffers from look-ahead bias). TSE:G.",
label = "tab:appendix-wf-tseg",
align = c('r', rep('R', 7)))
print(tab, digits = 6, hline.after = c(-1, 0, nrow(mat), nrow(mat) + nrow(mat.ext)),
table.placement = "H", width = "\\textwidth", size = "\\tiny")
```
### Application to other stocks
We run the walk-forward backtesting procedure for twelve other stocks: `r paste(unique(sapply(trades.all, function(t) { t$symbol })), collapse = ", ")`. Using 17 days of data, we train the model on a five-day rolling window and then apply it to trade out of sample during a day. We collect the returns from seven configurations, including the buy & hold and the HHMM strategies with none to five ticks of lags (we remind the reader that no lag implies look-ahead bias). In total, we compute $12 \times 17 \times 7 = 1,428$ daily returns. As the correlation matrix in Table \ref{tab:all-oos-summary} shows, the HHMM strategy is virtually uncorrelated with buy and hold. Section \ref{sec:appendix-all-wf} in the appendix details the trade returns and equity line for the backtested stocks.
```{r}
# Daily compound returns for every symbol/day in the backtest: buy & hold
# first, then the HHMM strategy at each execution lag
mat <- t(sapply(trades.all, function(trade) {
c(buyandhold = prod(1 + trade$trades$buyandhold) - 1,
sapply(trade$trades[-1], function(strat) {
prod(1 + strat$ret) - 1
}))
}))
colnames(mat) <- c("Buy-Hold", sprintf("HHMM (%s lag)", 0:5))
```
```{r, include = TRUE, results = "asis"}
# Column-wise summary statistics of the daily returns across all stocks
mat.ext <- apply(mat, 2, function(m) {
c(min = min(m),
mean = mean(m),
med = median(m),
max = max(m),
sd = sd(m),
iqr = IQR(m))})
rownames(mat.ext) <- c("Min", "Mean", "Median", "Max", "SD", "IQR")
setglobals()
tab <- xtable(round(100 * mat.ext, 6),
caption = "Summary statistics of daily return originated in the HHMM trading strategy for different levels of lags. Returns expressed as percentages.",
label = "tab:all-oos-summary",
align = c('r', rep('R', 7)))
print(tab, digits = 6,
table.placement = "H", width = "\\textwidth", size = "\\small")
```
```{r, include = TRUE, results = "asis"}
setglobals()
# Correlation between buy & hold and the lagged HHMM strategies
tabs <- xtable(cor(mat),
caption = "Correlation matrix of daily return originated in the HHMM trading strategy for different levels of lags.",
label = "tab:all-oos-cor",
align = c('l', rep('R', 7)))
print(tabs, colnames.format = "single",
digits = 4, NA.string = 'n.a.',
table.placement = "H",
width = "\\textwidth", size = "\\small")
```
## Discussion {#further-research}
### The statistical model
We find that the model proposed, while statistically simple, is highly expressive of financial domain knowledge. The hierarchical design creates a multi-level stochastic process that learns autocorrelations at different time scales. This accommodates multi-resolution, a typical characteristic of financial datasets. We remark that, whereas HHMMs offer a methodology to create a complex, highly non-linear model for time series, estimation and inference can be achieved with reasonable complexity. The developments by @murphy2001linear reduced cubic-time to linear-time inference, making inference feasible for high-frequency finance.
### The financial application
We have special interest in the feature extraction procedure. The features were cleverly designed to reproduce some of the most basic principles of technical analysis and, when applied to real data, they proved to be a powerful descriptor of price and volume movements. Nonetheless, we observe that the change in volume is the weakest component and offers great opportunities for further enhancements. The contribution of the author should not be neglected: in general terms, the features are the most important factor in the success or the failure of a machine learning project [@domingos2012few].
---
# References
<div id="refs"></div>
---
# Appendix
## Estimated parameters TSE:G 2007-05-04/2007-05-10.
\label{sec:appendix-tseg-parameters}
```{r, include = TRUE, results = "asis"}
# Full posterior summaries for every model parameter group reported in
# the appendix: initial distribution, transition matrix and emissions
pars <- c('p_1k', 'A_ij', 'phi_k')
# LaTeX row labels matching the layout of each parameter group
rows <- list(
sprintf("$\\pi_{%s}$", 1:4),
sprintf("$a_{%s}$",
apply(expand.grid(1:4, 1:4), 1, function(x) { paste0(x[2], x[1]) })),
sprintf("$\\phi_{%s}$",
apply(expand.grid(1:9, 1:4), 1, function(x) { paste0(x[2], x[1]) }))
)
prob <- c(0.025, 0.10, 0.25, 0.50, 0.75, 0.90, 0.975)
# One summary matrix per parameter group; seq_along() instead of
# 1:length() guards against the empty-vector corner case
lmat <- lapply(seq_along(pars), function(i) {
p <- pars[i]
mat <- summary(stan.fit, pars = p,
probs = prob)$summary
colnames(mat) <- c("Mean", "MCSE", "SD",
sprintf("$q_{%0.1f\\%%}$", 100 * prob),
"ESS", "$\\hat{R}$")
rownames(mat) <- rows[[i]]
mat
})
setglobals()
tabs <- xtableList(lmat,
caption = "Statistics summary for the distributions of
estimated parameters for TSE:G 2007-05-04
09:30:00/2007-05-10 16:30:00. MCSE means Monte Carlo
Standard Error, SD means (posteriori) Standard Deviation
and ESS means Effective Sample Size.",
label = "tab:tseg-ins-appendix-summary",
align = c('l', rep('R', 12)))
print(tabs, colnames.format = "single",
digits = 4, NA.string = 'n.a.',
table.placement = "H",
width = "\\textwidth", size = "\\footnotesize")
```
\newpage
## Out of sample trading performance for some selected assets
\label{sec:appendix-all-wf}
```{r, include = FALSE}
# Render the per-symbol appendix: for each backtested symbol, rebuild the
# daily-return matrix and knit the child template that tabulates/plots it.
# The child document reads `mat` from this environment.
syms <- unique(sapply(trades.all, function(t) { t$symbol }))
out <- NULL
for (sym in syms) {
task.filter <- sapply(trades.all, function(trade) {
any(grepl(paste(sym, collapse = '|'), trade$symbol))
})
trades <- trades.all[task.filter]
mat <- t(sapply(trades, function(trade) {
c(buyandhold = prod(1 + trade$trades$buyandhold) - 1,
sapply(trade$trades[-1], function(strat) {
prod(1 + strat$ret) - 1
}))
}))
colnames(mat) <- c("Buy-Hold", sprintf("HHMM (%s lag)", 0:5))
rownames(mat) <- sapply(trades, function(trade) { substr(trade$oos, 1, 10) })
# Accumulate the knitted child output; pasted into the document below
out <- c(out, knitr::knit_child('Rmd/appendix-wf.Rmd'))
}
```
`r paste(out, collapse='\n')`
## Original Computing Environment
```{r, include = TRUE}
# Record the user compiler flags (~/.R/Makevars) and the rstan package
# versions used for this run, for reproducibility
writeLines(readLines(file.path(Sys.getenv("HOME"), ".R/Makevars")))
devtools::session_info("rstan")
```