forked from laurentnoe/iedera
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathautomaton.hh
5324 lines (4443 loc) · 199 KB
/
automaton.hh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#ifndef __AUTOMATON_HH__
#define __AUTOMATON_HH__
/** @page automata Automata
* @brief Automata description and functions
* @tableofcontents
*
* @section automata-description Description
* This part describes an automaton\<T\> : each @ref automaton<T> is mainly a represented by a set of @ref state<T>, each itself being represented by a set of @ref transition<T>.
*
* @li An automaton\<T\> can be deterministic or not.
* @li It may bring probabilities (T = double, T = polynomial\<long long int\>), costs (T = cost\<int\>), counts (T = unsigned long long), or nothing (T = void).
*
* For an automaton, its set of states is stored in the @ref automaton::_states vector. For each state, its set of transitions is stored in the @ref state::_next vector of vectors, the outer vector being of size @ref gv_align_alphabet_size, each state has an extra @ref state::_final integer to mark a possible final flag or final value. Each transition brings a destination state number @ref transition::_state as an integer, and its transition probability / cost @ref transition::_prob as a templated \<T\> element
*
* By default the automaton\<T\> constructor is almost empty (it creates only a final state 0 and the init state 1), but several methods are proposed to construct @ref seed-automaton, @ref probabilistic-automaton, @ref structural-automaton (@ref automaton-construction). Several methods are also proposed to manipulate theses automata (@ref automaton-manipulate), compute properties (@ref automaton-computed-properties), convert them into matrices (@ref automaton-matrix-conversion),
*
* @section automaton-construction Construction
*
* @subsection seed-automaton seed automaton
*
* Three seed models are supported with their associated automata : subset seeds (built with the method @ref automaton::Automaton_SeedPrefixesMatching with possible several overlaps between occurences, with the method @ref automaton::Automaton_SeedPrefixesMatching_CoverageDM for global coverage measure, or with the method @ref automaton::Automaton_SeedLinearMatching for linear measure without overlaps), vectorized seeds (built with a Aho-Corasick like @ref automaton::Automaton_SeedBuhler), or more generally vectorized subset seeds (built with the combined @ref automaton::Automaton_SeedScore).
*
* The "Cost" variants of the subset seed model (@ref automaton::Automaton_SeedPrefixesMatchingCost) and the vectorized subset seed model (@ref automaton::Automaton_SeedScoreCost) are proposed variants that prune the automaton when Lossless construction set is used (@ref gv_lossless_flag).
*
* @subsection probabilistic-automaton probabilistic automaton
*
* Two models are directly supported : Bernoulli (@ref automaton::Automaton_Bernoulli) and Markov (@ref automaton::Automaton_Markov). Others can be read via the automaton::operator>>().
*
* @subsection structural-automaton structural automaton
*
* Three structural models are proposed for various objectives : The homogeneous alignments (@ref automaton::Automaton_Homogeneous) are proposed to guaranty that any sub-part of the alignment cannot have a higher score than the full one; To set seeds on a (cyclic) set of positions, the cycle automaton is used (@ref automaton::Automaton_Cycle) ; To get the automaton of alignments that must be rejected by the lossless constraints, the lossless automaton is used (@ref automaton::Automaton_Lossless) ; To get the automaton that counts a set of matching symbols, the symbols-counting automaton (@ref automaton::Automaton_CountAlphabetSymbols) can be used.
*
* You can find more details for Homogeneous alignments :
*
* Estimating seed sensitivity on homogeneous alignments
*
* You can find more details for Coverage automaton :
*
* A coverage criterion for spaced seeds and its applications
* to SVM string kernels and k-mer distances
*
* Additional files are also provided at @ref https://bioinfo.lifl.fr/yass/iedera_coverage/index.html and at @ref https://bioinfo.lifl.fr/yass/iedera_coverage/index_additional.html
*
* @section automaton-manipulate Manipulation
*
* Several methods are proposed to manipulate or combine automata. The more general is the product (@ref automaton::product), that realizes the union, intersection, exclusion, with possible effect on merging final states. The second one ask to compute the @f$m@f$ occurrence of a final state to accept the alignment, and is thus useful for multiple hits and also global coverage measure (@ref automaton::mhit). The last one builds the minimization of the current automaton with the Hopcroft algorithm (@ref automaton::Hopcroft), respecting different labelled final states.
*
* @section automaton-computed-properties Computing properties
*
* From the probabilities attached to each transition (@ref automaton::Prob), it is possible to compute the probability (T = double, T = polynomial\<long long int\>), cost (T = cost\<int\>) or count (T = unsigned long long) to be at any final state after some @f$n@f$ steps without any restriction (@ref automaton::Pr), or with the restriction to be in the lossless accepted language (@ref automaton::PrLossless).
*
* @section automaton-matrix-conversion Converting into matrices
* Several methods are proposed to convert the @f$ Automaton \times ProbabilisticModel @f$, @f$ Automaton \times CostModel @f$, or @f$ Automaton \times CountModel @f$ into matrices for more convenient computations. The probability @ref automaton::matrix_pr_product , the cost @ref automaton::matrix_cost_product and the counting @ref automaton::matrix_count_product give the product of two automata into a resulting matrix @f$M@f$ (that store either probabilities, costs, or counts) so that @f$M^n@f$ usually compute the needed properties. There are also three "stepwise equivalent" methods @ref automaton::matrices_step_pr_product, @ref automaton::matrices_step_cost_product, and @ref automaton::matrices_step_count_product : these three methods give the "breadth first" product as an ordered set of matrices @f$M_1,M_2,M_3\ldots,M_l@f$, thus enabling any computation @f$M_i,M_{i+1}\ldots,M_{j}@f$ @f$\forall 0 \leq i < j \leq l@f$ @see matrix @see matrices_slicer .
*
* @todo FIXME : to be continued
*/
/** @defgroup automaton automaton class templates
* @brief automaton with additional semi-rings templates for each transition
* @see polynomial,infint,cost
*/
// @{
/// If both USE_TYPE_TRAITS and USE_TR1_TYPE_TRAITS are not defined by ./configure, try to define classical TYPE_TRAITS
#if !defined(USE_TYPE_TRAITS) && !defined(USE_TR1_TYPE_TRAITS)
#define USE_TYPE_TRAITS
#endif
/// If both HAS_STD_TYPE_TRAITS and HAS_STD_TR1_TYPE_TRAITS are not defined by ./configure, try to define classical STD
#if !defined(HAS_STD_TYPE_TRAITS) && !defined(HAS_STD_TR1_TYPE_TRAITS)
#define HAS_STD_TYPE_TRAITS
#endif
//STL
#include <vector>
#include <queue>
#include <stack>
#include <functional>
#include <map>
//STD
#include <iostream>
#include <cstdio>
#include <cstdlib>
#include <cmath>
#include <string>
//STR1
#ifdef USE_TYPE_TRAITS
#include <type_traits>
#else
#ifdef USE_TR1_TYPE_TRAITS
#include <tr1/type_traits>
#else
#error <type_traits> or <tr1/type_traits> not selected in "matrix.hh" : try g++ ... -DUSE_TR1_TYPE_TRAITS or -DUSE_TYPE_TRAITS, and use g++ version >= 4.3 with "-std=c++0x" or "-std=gnu++0x"
#endif
#endif
#ifdef HAS_STD_TYPE_TRAITS
using namespace std;
#else
#ifdef HAS_STD_TR1_TYPE_TRAITS
using namespace std::tr1;
#else
#error std or std::tr1 not selected in "matrix.hh" : try g++ ... -DHAS_STD_TYPE_TRAITS or -DHAS_STD_TR1_TYPE_TRAITS, and use g++ version >= 4.3 with "-std=c++0x" or "-std=gnu++0x"
#endif
#endif
//STR
#include "macro.h"
#include "seed.h"
//STM
#include "matrix.hh"
//#define ASSERTB
//#define BUILD
#ifdef ASSERTB
#include <assert.h>
#endif
/// check if only one transition from this state s and for this letter a
#define DETERMINISTIC(s,a) (this->_states[s]._next[a].size() == 1)
/// check if alignment symbol a is matched by the seed symbol b (lookup table)
#define MATCHES_AB(a,b) (matchingmatrix[(a)][(b)])
/// check the score value for alignment symbol a and seed symbol b (lookup table)
#define SCORES_AB(a,b) (scoringmatrix[(a)][(b)])
/** @brief Define the product families of operators : this is used when performing a product of two automata.
*/
typedef enum ProductSetFinalType {
PRODUCT_UNION_FINAL_LOOP,
PRODUCT_UNION_NO_FINAL_LOOP,
PRODUCT_UNION_NO_FINAL_LOOP_ADD,
PRODUCT_INTERSECTION_FINAL_LOOP,
PRODUCT_INTERSECTION_NO_FINAL_LOOP,
PRODUCT_BUTNOT_FINAL_LOOP,
PRODUCT_BUTNOT_NO_FINAL_LOOP,
PRODUCT_NOTBUT_FINAL_LOOP,
PRODUCT_NOTBUT_NO_FINAL_LOOP,
PRODUCT_ADDHOC_FINAL_LOOP,
PRODUCT_ADDHOC_NO_FINAL_LOOP,
} ProductSetFinalType;
/** @brief Define the final value when crossing of a couple of final states values : this is used when performing a product of two automata.
*/
typedef int (*AddHoc_Final_Func)(int, int);
/** @brief Define the binary word used for jokers (and their relatives) elements when matching of not in the seed.
*/
typedef long long int Seed_X_Word;
/// final
#define TRUE 1
/// non final
#define FALSE 0
template<typename T> class automaton;
template<typename T> class state;
template<typename T> class transition;
/**
* @class transition
* @tparam T
* @brief transition to a given state (on a given letter) and with a given templated prob / cost / polynomial<int> / polynomial<count ??>
*/
template<typename T> class transition {
public:
/** @brief build a transition object
* @param state gives the state number reached by this transition
* @param prob gives the probability of this transition
* @todo{FIXME : clear() function is here needed for "composed T types", since T must also be deleted to free memory; check if the automatic deletion works for T = polynomial<C>}
*/
transition(int state = 0, const T prob = One<T>()) : _state(state), _prob(prob) {};
~transition() {;};
protected :
/// state number to be reached
int _state;
/// transition probability / cost /
T _prob;
/// state is a friend class to ease access
template<typename U> friend class state;
/// automaton is a friend class to ease access
template<typename U> friend class automaton;
/// print transition\<U\> information is friend
template<typename U> friend ostream& operator<<(ostream& os, const transition<U>& tr);
/// read transition\<U\> information is friend
template<typename U> friend istream& operator>>(istream& is, transition<U>& tr);
/// print state\<U\> information is friend
template<typename U> friend ostream& operator<<(ostream& os, const state<U>& st);
/// read state\<U\> information is friend
template<typename U> friend istream& operator>>(istream& is, state<U>& st);
/// print automaton\<U\> information is friend
template<typename U> friend ostream& operator<<(ostream& os, const automaton<U>& au);
/// read automaton\<U\> information is friend
template<typename U> friend istream& operator>>(istream& is, automaton<U>& au);
};
/** @brief output method for a transition\<T\> tr
* @param os is the outputstream to use
* @param tr is the transition\<T\> to output
* @return the outputstream used
*/
template<typename T> inline ostream& operator<<(ostream& os, const transition<T>& tr) {
os << "\t\t" << tr._state << "\t" << tr._prob;
return os;
}
/** @brief input method for a transition\<T\> tr
* @param is is the outputstream to use
* @param tr is the transition<T> to output
* @return the inputstream used
*/
template<typename T> inline istream& operator>>(istream& is, transition<T>& tr) {
// previous data removed if any
//tr._prob.clear(); //FIXME : this must be done for polynomial but it doesnt obviously work for <double>
///@todo{FIXME : clear() function is here needed for "composed T types", since T must also be deleted to free memory; check if the automatic deletion works for T = polynomial<C>}
tr._state = 0;
// reading state
int state = 0;
is >> state;
if (state < 0) {
cerr << "> when reading state" << endl;
_ERROR("transition<T> operator>>","incorrect state number " << state);
}
// reading prob
T prob_read;
is >> prob_read;
// normalize prob by using a product with T(1) : mostly here for user inputs such as "1 + 2 * y ^ 0 + 3 * x ^ 0" ...
T prob = prob_read * One<T>(); // for T = cost<C> template, it will "add" C(0) element
// setting tr
tr._state = state;
tr._prob = prob;
return is;
}
/**
* @class transition
* @tparam void
* @brief transition to a given state (on a given letter)
*/
template<> class transition<void> {
public:
/** @brief build a transition object
* @tparam void
* @param state gives the state number reached by this transition
*/
inline transition<void>(int state = 0) : _state(state) {};
/// Erase a transition
inline ~transition<void>() {;};
protected :
/// state number to be reached
int _state;
/// state is a friend class to ease access
friend class state<void>;
/// automaton is a friend class to ease access
friend class automaton<void>;
/// print transition\<void\> information is friend
friend ostream& operator<<(ostream& os, const transition<void>& tr);
/// read transition\<void\> information is friend
friend istream& operator>>(istream& is, transition<void>& tr);
/// print state\<void\> information is friend
friend ostream& operator<<(ostream& os, const state<void>& st);
/// read state\<void\> information is friend
friend istream& operator>>(istream& is, state<void>& st);
/// print automaton\<void\> information is friend
friend ostream& operator<<(ostream& os, const automaton<void>& au);
/// read automaton\<void\> information is friend
friend istream& operator>>(istream& is, automaton<void>& au);
};
/**
* @class state
* @tparam T
* @brief state represented as a table of list of transitions ("next[gv_align_alphabet_size]")
*/
template<typename T> class state {
public:
/// Build an empty state (empty transition lists)
inline state(int final = 0): _final(final) { _next = std::vector<std::vector<transition<T> > >(gv_align_alphabet_size, std::vector<transition<T> > ());};
/// Erase a state (clear transition lists first)
inline ~state() {
for(int a = 0; a < gv_align_alphabet_size; a++)
_next[a].clear();
_next.clear();
};
/** @brief Clear transition lists
*/
inline void clear() { for(int a = 0; a < gv_align_alphabet_size; a++)
_next[a].clear();
_next.clear();
};
protected:
/// forward transitions list on letter A
std::vector<vector<transition<T> > > _next;
/// final state
int _final;
/// automaton is a friend class to ease access
template<typename U> friend class automaton;
/// print state\<U\> information is friend
template<typename U> friend ostream& operator<<(ostream& os, const state<U>& st);
/// read state\<U\> information is friend
template<typename U> friend istream& operator>>(istream& is, state<U>& st);
/// print automaton\<U\> information is friend
template<typename U> friend ostream& operator<<(ostream& os, const automaton<U>& au);
/// read automaton\<U\> information is friend
template<typename U> friend istream& operator>>(istream& is, automaton<U>& au);
};
/// output method for a state<T> st
template<typename T> inline ostream& operator<<(ostream& os, const state<T>& st) {
os << "\t" << st._final << endl;
for (int a = 0; a < gv_align_alphabet_size; a++) {
os << "\t\t" << a << "\t" << st._next[a].size() << endl;
for (typename std::vector<transition<T> >::const_iterator iter = st._next[a].begin(); iter != st._next[a].end(); iter++) {
os << "\t\t\t" << (*iter) << endl;
}
}
return os;
}
/// input method for a state<T> st
template<typename T> inline istream& operator>>(istream& is, state<T>& st) {
// previous data removed if any
for (int a = 0; a < gv_align_alphabet_size; a++)
st._next[a].clear();
st._next.clear();
st._final = 0;
// reading final
int final = 0;
is >> final;
if (final < 0) {
cerr << "> when reading final" << endl;
_ERROR("state<T> operator>>","incorrect final value " << final);
}
st._final = final;
st._next = std::vector<vector<transition<T> > > (gv_align_alphabet_size);
for (int a = 0; a < gv_align_alphabet_size; a++) {
// reading letter
int a_r;
is >> a_r;
if (a_r != a) {
cerr << "> when reading alphabet symbol" << endl;
_ERROR("state<T> operator>>","incorrect alphabet letter " << a_r << " (expected " << a << ")");
}
// reading next[a] size
int next_a_size;
is >> next_a_size;
if (next_a_size < 0) {
cerr << "> when reading number of transitions for alphabet symbol " << a_r << endl;
_ERROR("state<T> operator>>","incorrect number " << next_a_size);
}
st._next[a] = std::vector<transition<T> > (next_a_size);
// reading each transtion to fill next[a]
for (int i = 0; i < next_a_size; i++) {
is >> st._next[a][i];
}
}
return is;
}
/**
* @class automaton
* @tparam T
* @brief automaton<T> (deterministic or non-deterministic, probabilistic or not) roughly represented as a std::vector of states
*
* this one can either be affected to:
* @li a seed automaton (deterministic, initial state is 1, final states have the final tag)
* @li a probability model (Bernoulli, Markov, and old M3,M14,HMM models, can either be deterministic or non-deterministic)
* @li a target alignment model (deterministic)
*
*/
template<typename T> class automaton {
public:
/// constructor for an empty automaton
automaton() {
_states = std::vector<state<T> >(0);
_init_states = std::vector<int>(0);
};
/// destructor for all possibles automata build by this class
~automaton() {
for (int i = 0; i < (int)_states.size(); i++) {
_states[i].clear();
}
_states.clear();
_init_states.clear();
};
/** @name Build automata
* @brief each one builds a special automaton
*/
// @{
/** @brief simple algorithm to build a linear automaton of the current seed "s", that only matches if the seed match from its very beginning position
* the very beginning of the sequence : only useful for covariance computation
* @param s is the seed descriptor
* @param matchingmatrix is a boolean matrix that gives for alignment letter "a", and seed letter "b", the matching with "matrix[a][b]"
* @return 0
*/
int Automaton_SeedLinearMatching(const seed & s,
const std::vector<std::vector<int> > & matchingmatrix);
/** @class SeedPrefixesMatchingSet_old
* @brief simple "inner class" to keep states @f$ <X,t> @f$ and @f$ k @f$,
* during SeedPrefixMatching_old method
* @see Automaton_SeedPrefixesMatching_old
*/
class SeedPrefixesMatchingSet_old {
public:
/// set of prefixes matching
Seed_X_Word X;
/// lenght of the last run of '1'
short t;
/// length of the maximal prefix in the set @f$ X @f$ : @f$ k = max(X) @f$
short k;
/** @brief build a SeedPrefixesMatchingSet_old
*/
SeedPrefixesMatchingSet_old(Seed_X_Word X = 0, short t = 0, short k = 0) : X(X), t(t), k(k) {};
};
#define SEEDPREFIXESMATCHING_OLD_X(s) (statesSeedPrefixesMatchingSet_old[(s)].X)
#define SEEDPREFIXESMATCHING_OLD_T(s) (statesSeedPrefixesMatchingSet_old[(s)].t)
#define SEEDPREFIXESMATCHING_OLD_K(s) (statesSeedPrefixesMatchingSet_old[(s)].k)
/** @brief Old algorithm in @f$ \mathcal{O}(w 2^{s-w})@f$ for subset seed matching (use on index of size @f$ \mathcal{O}(w 2^{s-w})@f$ )
* @param s is the seed descriptor
* @param matchingmatrix is a boolean matrix that gives for alignment letter "a", and seed letter "b", the matching with "matrix[a][b]"
* @param nomerge if final states dont have to be merged together into a single state (default behaviour is merging)
* @return 0
*/
int Automaton_SeedPrefixesMatching_old (const seed & s,
const std::vector<std::vector<int> > & matchingmatrix,
const bool nomerge = false);
/** @class SeedPrefixesMatchingSet
* @brief simple "inner class" to keep states @f$ <X,t> @f$ and @f$ k @f$, together with @f$ Xp_i @f$ and @f$ RevMaxXp_i @f$ to keep previous index track,
* during SeedPrefixMatching method
* @see Automaton_SeedPrefixesMatching
*/
class SeedPrefixesMatchingSet {
public:
/// set of prefixes matching
Seed_X_Word X;
/// lenght of the last run of '1'
short t;
/// length of the maximal prefix in the set @f$ X @f$ : @f$ k = max(X) @f$
short k;
/// gives the index of the state @f$< X' = X / max(X), t > @f$ (this state does exists and is created before @f$ <X,t> @f$)
int Xp_i;
/** @brief gives the index of last state @f$ <X'',t> @f$ created that verifies :
* @li @f$ X''/max(X'') = X @f$
* @li @f$ max(X'') @f$ is greatest among all created @f$ X'' @f$ verifying the previous condition
*/
int RevMaxXp_i;
/** @brief build a SeedPrefixesMatchingSet
*/
SeedPrefixesMatchingSet(Seed_X_Word X = 0,short t = 0,short k = 0, int Xp_i = 0, int RevMaxXp_i = 0) : X(X), t(t), k(k), Xp_i(Xp_i), RevMaxXp_i(RevMaxXp_i) {};
};
#define SEEDPREFIXESMATCHING_X(s) (statesSeedPrefixesMatchingSet[(s)].X)
#define SEEDPREFIXESMATCHING_T(s) (statesSeedPrefixesMatchingSet[(s)].t)
#define SEEDPREFIXESMATCHING_K(s) (statesSeedPrefixesMatchingSet[(s)].k)
#define SEEDPREFIXESMATCHING_XP_I(s) (statesSeedPrefixesMatchingSet[(s)].Xp_i)
#define SEEDPREFIXESMATCHING_REVMAXXP_I(s) (statesSeedPrefixesMatchingSet[(s)].RevMaxXp_i)
/** @brief New linear algorithm for subset seed matching (CIAA 2007)
* @param s is the seed descriptor
* @param matchingmatrix is a boolean matrix that gives for alignment letter "a", and seed letter "b", the matching with "matrix[a][b]"
* @param nomerge if final states dont have to be merged together into a single state (default behaviour is merging)
* @return 0
*/
int Automaton_SeedPrefixesMatching (const seed & s,
const std::vector<std::vector<int> > & matchingmatrix,
const bool nomerge = false);
/** @class SeedPrefixesMatchingDMSet
* @brief simple "inner class" to keep, for a set of seeds of size @f$ n @f$, the states @f$ <X[1..n],t> @f$ and their maximal prefix @f$ k[1..n] @f$, together with the coverage inside a std::vector @f$ p @f$,
* during SeedPrefixMatching_CoverageDM method
* @see Automaton_SeedPrefixesMatching_CoverageDM
*/
class SeedPrefixesMatchingDMSet {
public:
/// set of prefixes matching for each seed
std::vector<Seed_X_Word> X;
/// lenght of the last run of '1'
short t;
/// length of the maximal prefix in the set @f$ X @f$ : @f$ k = max(X) @f$
std::vector<short> k;
/** set of seed matching symbols as a suffix of the maximal ongoing seed prefix
* this std::vector can be "padded right" by run of '0' (uncovered),
* but "padded left" run of '0' must be removed.
*/
std::vector<int> p;
/// final state value (here can be 0 or any coverage value > 0)
int final;
/** @brief build a SeedPrefixesMatchingSet
*/
SeedPrefixesMatchingDMSet(std::vector<Seed_X_Word> X, std::vector<short> k, std::vector<int> p, short t, int final) : t(t), final(final) {
this->X = std::vector<Seed_X_Word>(X);
this->k = std::vector<short>(k);
this->p = std::vector<int>(p);
};
/** @brief clear a SeedPrefixesMatchingSet
*/
~SeedPrefixesMatchingDMSet() {
this->X.clear();
this->k.clear();
this->p.clear();
};
/** @brief compare two SeedPrefixesMatchingDMSet (used for the map<SeedPrefixesMatchingDMSet>)
* @param lhs is the left SeedPrefixesMatchingDMSet to be compared
* @param rhs is the right SeedPrefixesMatchingDMSet to be compared
* @return true iif (lhs < rhs)
*/
friend bool operator<(const SeedPrefixesMatchingDMSet& lhs, const SeedPrefixesMatchingDMSet& rhs) {
/* compare T size */
if (lhs.t < rhs.t)
return true;
else
if (lhs.t > rhs.t)
return false;
/* then compare "k" size first */
for (unsigned x = 0; x < lhs.k.size(); x++) {
if (lhs.k[x] < rhs.k[x]) {
return true;
} else {
if (lhs.k[x] > rhs.k[x]) {
return false;
}
}
}
/* then go to "X" details */
for (unsigned x = 0; x < lhs.k.size(); x++) {
if (lhs.X[x] < rhs.X[x]) {
return true;
} else {
if (lhs.X[x] > rhs.X[x]) {
return false;
}
}
}
/* then compare "P" size */
if (lhs.p.size() < rhs.p.size())
return true;
else
if (lhs.p.size() > rhs.p.size())
return false;
/* then compare "P" elements */
for (unsigned i = 0; i < lhs.p.size(); i++) {
if (lhs.p[i] < rhs.p[i]) {
return true;
} else {
if (lhs.p[i] > rhs.p[i]) {
return false;
}
}
}
/* then finally : compare final */
if (lhs.final < rhs.final)
return true;
else
if (lhs.final > rhs.final)
return false;
return false;
};
};
#define SEEDPREFIXESMATCHINGDM_S(s) (statesSeedPrefixesMatchingDMSet[(s)])
#define SEEDPREFIXESMATCHINGDM_X(s) (statesSeedPrefixesMatchingDMSet[(s)].X)
#define SEEDPREFIXESMATCHINGDM_T(s) (statesSeedPrefixesMatchingDMSet[(s)].t)
#define SEEDPREFIXESMATCHINGDM_K(s) (statesSeedPrefixesMatchingDMSet[(s)].k)
#define SEEDPREFIXESMATCHINGDM_P(s) (statesSeedPrefixesMatchingDMSet[(s)].p)
#define SEEDPREFIXESMATCHINGDM_FINAL(s) (statesSeedPrefixesMatchingDMSet[(s)].final)
/** @brief Subset seed matching automaton for multiple subset seeds, but here with the Donald Martin extension
* that measures the (non-ovelapping) number of '#' elements that cover a '1' position : extended so
* weight_seed_alphabet
*
* @param s is the set of seeds descriptos that are used to build the automaton
* @param weight_seed_alphabet is the weight given for each seed symbol : when several symbols are under a position,
* then the "winner" of the overlap is the one with the biggest weight
* @param matchingmatrix is a boolean matrix that gives for alignment letter "a", and seed letter "b", the matching with "matrix[a][b]"
* @return 0
* @see Automaton_SeedPrefixesMatching
*/
int Automaton_SeedPrefixesMatching_CoverageDM (const std::vector<seed *> & s,
const std::vector<int> weight_seed_alphabet,
const std::vector<std::vector<int> > & matchingmatrix);
/** @class SeedPrefixesMatchingCostSet
* @brief simple "inner class" to keep states @f$ <X,t> @f$ and @f$ k + Cost @f$
* during SeedPrefixMatching method
* @see Automaton_SeedPrefixesMatchingCost
*/
class SeedPrefixesMatchingCostSet {
public:
/// set of prefixes matching
Seed_X_Word X;
/// lenght of the last run of '1'
short t;
/// length of the maximal prefix in the set @f$ X @f$ : @f$ k = max(X) @f$
short k;
/// gives the index of the state @f$< X' = X / max(X), t > @f$ (this state does exists and is created before @f$ <X,t> @f$)
int Xp_i;
/** @brief gives the index of last state @f$ <X'',t> @f$ created that verifies :
* @li @f$ X''/max(X'') = X @f$
* @li @f$ max(X'') @f$ is greatest among all created @f$ X'' @f$ verifying the previous condition
*/
int RevMaxXp_i;
/// gives the Cost at this state
int Cost;
/** @brief build a SeedPrefixesMatchingCostSet
*/
SeedPrefixesMatchingCostSet(Seed_X_Word X = 0, short t = 0, short k = 0, int Xp_i = 0, int RevMaxXp_i = 0, int Cost = 0) : X(X), t(t), k(k), Xp_i(Xp_i), RevMaxXp_i(RevMaxXp_i), Cost(Cost) {};
};
#define SEEDPREFIXESMATCHINGCOST_X(s) (statesSeedPrefixesMatchingCostSet[(s)].X)
#define SEEDPREFIXESMATCHINGCOST_T(s) (statesSeedPrefixesMatchingCostSet[(s)].t)
#define SEEDPREFIXESMATCHINGCOST_K(s) (statesSeedPrefixesMatchingCostSet[(s)].k)
#define SEEDPREFIXESMATCHINGCOST_XP_I(s) (statesSeedPrefixesMatchingCostSet[(s)].Xp_i)
#define SEEDPREFIXESMATCHINGCOST_REVMAXXP_I(s) (statesSeedPrefixesMatchingCostSet[(s)].RevMaxXp_i)
#define SEEDPREFIXESMATCHINGCOST_COST(s) (statesSeedPrefixesMatchingCostSet[(s)].Cost)
/** @brief Modified version that takes lossless costs into account
* @param s is the seed descriptor
* @param matchingmatrix is a boolean matrix that gives for alignment letter "a", and seed letter "b", the matching with "matrix[a][b]"
* @param nomerge if final states dont have to be merged together into a single state (default behaviour is merging)
* @param costs gives a std::vector of costs for each align alphabet letter
* @param cost_threshold gives a cost threshold that must not be reached by any alignment
* @return 0
* @see Automaton_SeedPrefixesMatching
*/
int Automaton_SeedPrefixesMatchingCost (const seed& s,
const std::vector<std::vector<int> > & matchingmatrix,
const bool nomerge,
const std::vector<int> & costs,
const int cost_threshold);
/** @class SeedBuhlerSet
* @brief simple "inner class" to keep states properties
* during SeedBuhler (Aho Corasick) method
* @see Automaton_SeedBuhler
*/
class SeedBuhlerSet {
public:
/// Aho-Corasick Fail function for this state
int Fail;
/// Number of transition that need to be completed for this state
short Rest;
/// Level (distance from the root) in the AC-tree for this state
short Level;
/** @brief build a SeedBuhlerSet
*/
SeedBuhlerSet(int Fail = 0, short Rest = 0, short Level = 0) : Fail(Fail), Rest(Rest), Level(Level) {};
};
#define SEEDBUHLER_FAIL(s) (statesSeedBuhlerSet[(s)].Fail)
#define SEEDBUHLER_REST(s) (statesSeedBuhlerSet[(s)].Rest)
#define SEEDBUHLER_LEVEL(s) (statesSeedBuhlerSet[(s)].Level)
/** @brief Aho-Corasick automaton used by J.Buhler
* @param s is the seed descriptor
* @param matchingmatrix is a boolean matrix that gives for alignment letter "a", and seed letter "b", the matching with "matrix[a][b]"
* @param nomerge if final states dont have to be merged together into a single state (default behaviour is merging)
* @return 0
*/
int Automaton_SeedBuhler (const seed & s,
const std::vector<std::vector<int> > & matchingmatrix,
const bool nomerge = false);
/** @class SeedScoreSet
* @brief simple "inner class" to keep states properties
* during SeedScore method
* @see Automaton_SeedScore
*/
class SeedScoreSet {
public:
/// Aho-Corasick Fail function for this state
int Fail;
/// Score reached at this state
short Score;
/// Level (distance from the root) in the AC-tree for this state
short Level;
/** @brief build a SeedScoreSet
*/
SeedScoreSet(int Fail = 0, short Score = 0, short Level = 0) : Fail(Fail), Score(Score), Level(Level) {};
};
#define SEEDSCORE_FAIL(s) (statesSeedScoreSet[(s)].Fail)
#define SEEDSCORE_SCORE(s) (statesSeedScoreSet[(s)].Score)
#define SEEDSCORE_LEVEL(s) (statesSeedScoreSet[(s)].Level)
/** @brief Aho-Corasick automaton with scoring method to prune prefixes
* @param s is the seed descriptor
* @param matchingmatrix is a boolean matrix that gives for alignment letter "a", and seed letter "b", the matching with "matrix[a][b]"
* @param scoringmatrix is an integer matrix that gives for alignment letter "a", and seed letter "b", the score with "matrix[a][b]"
* @param scoringthreehold is the minimal score that has to be reached to have a match for the seed
* @param nomerge if final states dont have to be merged together into a single state (default behaviour is merging)
* @return 0
*/
int Automaton_SeedScore (const seed & s,
const std::vector<std::vector<int> > & matchingmatrix,
const std::vector<std::vector<int> > & scoringmatrix,
const int scoringthreehold,
const bool nomerge = false);
/** @class SeedScoreCostSet
* @brief simple "inner class" to keep states properties,
* during SeedScoreCost method
* @see Automaton_SeedScore
*/
class SeedScoreCostSet {
public:
/// Aho-Corasick Fail function for this state
int Fail;
/// Score reached at this state
short Score;
/// Level (distance from the root) in the AC-tree for this state
short Level;
/// gives the Cost at this state
int Cost;
/** @brief build a SeedScoreCostSet
*/
SeedScoreCostSet(int Fail = 0, short Score = 0, short Level = 0,int Cost = 0) : Fail(Fail), Score(Score), Level(Level), Cost(Cost) {};
};
#define SEEDSCORECOST_FAIL(s) (statesSeedScoreCostSet[(s)].Fail)
#define SEEDSCORECOST_SCORE(s) (statesSeedScoreCostSet[(s)].Score)
#define SEEDSCORECOST_LEVEL(s) (statesSeedScoreCostSet[(s)].Level)
#define SEEDSCORECOST_COST(s) (statesSeedScoreCostSet[(s)].Cost)
/** @brief Modified version that takes lossless costs into account
* @param s is the seed descriptor
* @param matchingmatrix is a boolean matrix that gives for alignment letter "a", and seed letter "b", the matching with "matrix[a][b]"
* @param scoringmatrix is an integer matrix that gives for alignment letter "a", and seed letter "b", the score with "matrix[a][b]"
* @param scoringthreehold is the minimal score that has to be reached to have a match for the seed
* @param nomerge if final states dont have to be merged together into a single state (default behaviour is merging)
* @param costs gives a std::vector of costs for each align alphabet letter
* @param cost_threshold gives a cost threshold that must not be reached by any alignment
* @return 0
* @see Automaton_SeedScore
*/
int Automaton_SeedScoreCost(const seed & s,
const std::vector<std::vector<int> > & matchingmatrix,
const std::vector<std::vector<int> > & scoringmatrix,
const int scoringthreehold,
const bool nomerge,
const std::vector<int> & costs,
const int cost_threshold);
/** @brief build a probabilistic bernoulli model
* @param p gives the probability of each letter of A (sum must be equal to 1.0)
* @return 0
*/
int Automaton_Bernoulli(const std::vector <T> & p /*[gv_align_alphabet_size]*/);
/** @brief build a probabilistic markov model of order k
* @param p gives the probability of each word of A^k (sum must be equal to 1.0)
* @param k gives the order
* @return 0
*/
int Automaton_Markov(const std::vector<T> & p,
const int k);
/** @brief build an Homogeneous sequence automaton
* @details It represents an alignment such that no substring of the alignment has a score greater than the full alignment
* - state 0 : accepting state
* - state 1 : starting state
* - state 2 : rejecting state
* @param scores gives a table of scores for each of the A letters
* @param length gives the alignment length
* @return 0
*/
int Automaton_Homogeneous(const std::vector<int> & scores,
const int length);
/** @brief build an Counting symbols automaton (automaton with two states)
* @details It gives a 1-final state if "a >= min_a", or the init state (non-final) otherwise
* with a T(1) value on each transition to perform counting tasks : must be used with counting semi-rings
* @param min_a minimal symbol that trigger a count
* @return 0
*/
int Automaton_CountAlphabetSymbols(const int min_a = gv_align_alphabet_size-1);
/** @brief build a cyclic automaton
* @param cycle gives the cycle size
* @param final_list gives a table of positions that are final ( 0 <= final < cycle)
* @param final_nb gives the size of the previous table
* @return 0
*/
int Automaton_Cycle(const int cycle, const int * const final_list, const int final_nb);
/** @brief build a lossless automaton
* @param costs gives a table of costs for each of the A letters
* @param max_cost is the maximal cost that can be reached so that the automaton is not in an accepting state
* @return 0
*/
int Automaton_Lossless(const std::vector<int> & costs, const int max_cost);
// @}
/** @name Manipulate automata
* @brief reduction/product
*/
// @{
#define REVERSESTATE(c,a,b) (((c) == (a))?(b):(((c) == (b))?(a):(c)))
/** @brief Hopcroft automaton minimization
* @return the minimized automaton, does not affect the current automaton
* @warning this method ignores \<T\> transition values
*/
automaton<T> * Hopcroft() const;
/** @brief Isomorphism check
* @param other is the automaton to be compared to
* @return true if the two automata are isomorph
* @warning this method ignores \<T\> transition values
*/
bool isIsomorphTo(const automaton<T> & other) const;
/** @brief Generic Automata Product
*
* Two definitions exist:
* - one if the second automaton type \<U\> is "void" (cannot be stored in the resulting automaton),
* - another one when type \<U\> can be used for each transition built.
*
* @param other is the second automaton used for the product
* @param productSetFinalType indicates if product final states are the crossproduct of both automaton final states or only one of these
* @li PRODUCT_UNION_xxx : automata "union" of final states,
* @li PRODUCT_INTERSECTION_xxx : automata "intersection" of final states
* @li PRODUCT_BUTNOT_xxx : automata "this".final BUT NOT "other".final
* @li PRODUCT_NOTBUT_xxx : automata NOT "this".final BUT "other".final
* @li PRODUCT_UNION_NO_LOOP_ADD : automata "union" and "sum value" of final states, (integer value)
* @li PRODUCT_ADDHOC_xxx : automata addhoc final function aff does this work
* with
* @li xxx : LOOP / NO_LOOP : indicates if the final state is a single one absorbant (force it, otherwise keep it as in the "true" product)
* @param depth indicates the maximal depth that must be reached : extra states are non final selflooping states
* (by default, this value is greater than 2 Billions, but the given alignment length should be enought in most cases)
* @param aff function indicates the final value to be used, is is used when @param productSetFinalType = PRODUCT_ADDHOC_xxx
* @param shift is a positive or negative value that reset the first or second automaton on its initial state during a defined depth given by this parameter
* @return a new automaton that only gives reachable states of the product
* @see matrix_product
*/
#ifdef HAS_STD_TYPE_TRAITS
template<typename U> typename enable_if_ca <std::is_void<U>::value, automaton<U> * >::type
#else
template<typename U> typename enable_if_ca <std::tr1::is_void<U>::value, automaton<U> * >::type
#endif
product(const automaton<U> & other,
const ProductSetFinalType productSetFinalType,
const int depth = INT_INFINITY,
const AddHoc_Final_Func aff = NULL,
const int shift = 0) const {
VERB_FILTER(VERBOSITY_DEBUGGING, INFO__("== Product == (productsetfinaltype:" << dec << productSetFinalType << ")"););
#ifdef ASSERTB
if ((this->size() * other.size()) > (1 << 28)) {
_ERROR("product"," size of product automaton will \"certainly\" explode : better stop here ...");
}
#endif
automaton<U> * result = new automaton<U>();
int StateFinal = result->addNewState(TRUE);
result->selfLoop(StateFinal);
#ifdef USEMAPPRODUCT
typedef less<std::pair<int,int> > lessp;
typedef map<std::pair<int,int>, int, lessp > maptype;
maptype statesNbIndex;
#define PRODINDEX(i) (i)
#else
std::vector<int> statesNbIndex( this->size() * other.size(), 0);
#define PRODINDEX(i) ((i).first * other.size() + (i).second)
#endif
#ifdef USEQUEUEPRODUCT
std::queue<std::pair<int,int> > statesNbRemaining;
#else
std::stack<std::pair<int,int> > statesNbRemaining;
#endif
// (0) final loop or not
int final_loop = (productSetFinalType == PRODUCT_UNION_FINAL_LOOP || productSetFinalType == PRODUCT_INTERSECTION_FINAL_LOOP || productSetFinalType == PRODUCT_BUTNOT_FINAL_LOOP || productSetFinalType == PRODUCT_NOTBUT_FINAL_LOOP || productSetFinalType == PRODUCT_ADDHOC_FINAL_LOOP) ? TRUE : FALSE;
// (1) start the product init state
std::pair<int,int> indexInit = std::pair<int,int>(1,1);
int stateInit = result->addNewState();
statesNbIndex[PRODINDEX(indexInit)] = stateInit;
statesNbRemaining.push(indexInit);
if (
gv_subalignment_flag && ((this->_init_states.size() > 0) ||
(other._init_states.size() > 0)
)
) {
result->_init_states.push_back(stateInit);
}
#ifdef USEQUEUEPRODUCT
// depth of the states being built
int level_i = 0;
int stateN_of_level_i = stateInit;
#endif
// (2) take all the non-considered interesting cases pairs on both automatons
while (!statesNbRemaining.empty()) {