forked from laurentnoe/iedera
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.cc
4341 lines (3848 loc) · 177 KB
/
main.cc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/**
* @mainpage iedera
* @version 1.07
* @section Introduction
* Iedera is a program to select and design subset seeds and vectorized
* subset seeds. This program heavily relies on automaton theory to compute the
* sensitivity of seed that are represented by the subset seed model, or the
* vectorized subset seed model.
*
*
*
* @section Download
* Download the iedera source code and binaries on the following web page:\n
* <A HREF="http://bioinfo.univ-lille.fr/yass/iedera.php">http://bioinfo.univ-lille.fr/yass/iedera.php</A>\n
* A small tutorial to use iedera is also provided on this web page.\n
*
* @author <A HREF="http://www.cristal.univ-lille.fr/~noe" TARGET="_top">Laurent Noe</A>
* @author <A HREF="http://www.mccme.ru/lifr/pers/roytberg.htm">Mikhail Roytberg</A>
* @author <A HREF="http://www-igm.univ-mlv.fr/~koutcher/" TARGET="_top">Gregory Kucherov</A>
*
*
*/
/** @defgroup main main program
* @brief main program, with pareto seed/sens/sel attributes, and the set of global variables (gv_...)
* used to parametrise the computation
*/
// @{
//C stuff
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <sys/time.h>
#include <limits>
//STL
#include <vector>
#include <list>
#include <utility>
#include <algorithm>
#include <iterator>
//STD
#include <iostream>
#include <iomanip>
#include <fstream>
#include <sstream>
#include <cstdio>
#include <cstdlib>
#include <cmath>
#include <string>
//using namespace std;
//STR
#include "macro.h"
#include "seed.h"
#include "automaton.hh"
#include "matrix.hh"
#ifdef USEINFINT
#include "infint.hh"
/// BIGINT polynomial coefs computation defined as Infinite Integer (much slower ... but overflow secure)
#define BIGINT infint<long long>
#else
/// BIGINT polynomial coefs computation defined as long long (quite fast ... but overflows quickly ...)
#define BIGINT long long
#endif
/** @name program name and version number
* @brief defined if not provided by the compiler
*/
// @{
#ifndef PACKAGE
/// iedera program name
#define PACKAGE "iedera"
#endif
#ifndef VERSION
/// iedera program version
#define VERSION "1.07"
#endif
// @}
/** @name subset matching matrix (uninitialized)
* @brief must be initialized in the main function
*/
// @{
/// boolean matching matrix
std::vector<std::vector<int> > gv_subsetseed_matching_matrix;
/// build an initial matrix gv_subsetseed_matching_matrix @see gv_subsetseed_matching_matrix
void build_default_subsetseed_matching_matrix();
// @}
/** @name background and foreground probabilities (uninitialized)
* @brief must be initialized in the main function
*/
// @{
/// foreground sensitivities
std::vector<double> gv_bsens;
/// k-order model when Markov is activated (0 for Bernoulli) @see gv_bsens_k
int gv_bsens_k = 0;
/// probabilistic automaton associated with the sensitivity computation @see gv_bsens
automaton <double> * gv_bsens_automaton = NULL;
/// background seletivities @see gv_bsel_k
std::vector<double> gv_bsel;
/// k-order model when Markov is activated (0 for Bernoulli) @see gv_bsel
int gv_bsel_k = 0;
/// build probabilities @see gv_bsens,gv_bsens_k,gv_bsel,gv_bsel_k
void build_default_probabilities();
// @}
/** @name vectorized seeds (uninitialized)
* @brief generate subset seeds with an additional scoring constraint
*/
// @{
/// vector scoring matrix when vectorized subset seeds are activated @see gv_vectorized_flag,gv_vectorizedsubsetseed_scoring_threshold
std::vector<std::vector<int> > gv_vectorizedsubsetseed_scoring_matrix;
/// scoring threshold for a vectorized subset seed @see gv_vectorizedsubsetseed_scoring_matrix
int gv_vectorizedsubsetseed_scoring_threshold = 0;
/// flag that activates vectorized subset seeds (default subset seed)
bool gv_vectorized_flag = false;
/// build an initial matrix gv_vectorizedsubsetseed_scoring_matrix @see gv_vectorizedsubsetseed_scoring_matrix
void build_default_vectorizedsubsetseed_scoring_matrix();
// @}
/** @name seed enumeration
* @brief parameters used to enumerate seeds
*/
// @{
/// Hopcroft minimization applied on each deterministic automaton
bool gv_minimize_flag = true;
/// hillclimbing local optimization for each seed and set of positions
bool gv_hillclimbing_flag = false;
/// hillclimbing local optimization frequency
double gv_hillclimbing_alpha = 5e-3;
/// flag set when a seed is given on command line
bool gv_motif_flag = 0;
/// set of seeds being computed or optimized
std::vector<seed*> gv_seeds;
/// maximal difference of weight (given as a ratio) between seeds being optimized together
double gv_jive = 0.0;
// @}
/** @name cycle parameters
* @brief generates position restricted seeds of a cycle
*/
// @{
/// cycle flag @see gv_cycles
bool gv_cycles_flag = false;
/// cycle size (one for each seed) @see gv_cycles_flag
std::vector<int> gv_cycles;
/// cycle pos (number of pos per cycle) @see gv_cycles_pos_list,gv_cycles_flag
std::vector<int> gv_cycles_pos_nb;
/// cycle pos (pos for each cycle) @see gv_cycles_flag,gv_cycles_pos_nb
std::vector<std::vector<int> > gv_cycles_pos_list;
// @}
/* @name multihit criterion
* @brief consider a hit if at least $m$-hits (of any of the seeds) have been counted
*/
// @{
/// flag that activates the mhits @see gv_multihit_nb
bool gv_multihit_flag = false;
/// number of mhits (must be >= 1 if the flag is activated) @see gv_multihit_flag
int gv_multihit_nb = 0;
// @}
/* @name DM coverage criterion
* @brief consider a hit if the DM (Donald Martin) coverage criterion is reached
*/
// @{
/// flag that activates the coverage criterion
bool gv_global_coverage_flag = false;
/// number of element to be covered (must be >= 1 if the flag is activated) @see gv_multihit_flag
int gv_global_coverage_nb = 0;
/// coverage cost for each seed alphabet element
std::vector<int> gv_global_coverage_cost;
// @}
/** @name excluded seeds
* @brief exclude seeds motif from alignments generated
*/
// @{
/// excluded seeds
std::vector<seed*> gv_xseeds;
// @}
/** @name excluded seed cycle parameters
* @brief generates position restricted seeds of a cycle
*/
// @{
/// cycle flag @see gv_xseeds_cycles
bool gv_xseeds_cycles_flag = false;
/// cycle size (one for each seed) @see gv_xseeds_cycles_flag
std::vector<int> gv_xseeds_cycles;
/// cycle pos (number of pos per cycle) @see gv_xseeds_cycles_pos_list,gv_xseeds_cycles_flag
std::vector<int> gv_xseeds_cycles_pos_nb;
/// cycle pos (pos for each cycle) @see gv_xseeds_cycles_flag,gv_xseeds_cycles_pos_nb
std::vector<std::vector<int> > gv_xseeds_cycles_pos_list;
// @}
/* @name excluded seed multihit criterion
* @brief consider a hit if at least $m$-hits (of any of the excluded seeds) have been counted
*/
// @{
/// flag that activates the mhits @see gv_xseeds_multihit_nb
bool gv_xseeds_multihit_flag = false;
/// number of mhits (must be >= 1 if the flag is activated) @see gv_xseeds_multihit_flag
int gv_xseeds_multihit_nb = 0;
// @}
/** @name homogeneous alignments
* @brief generates homogeneous alignments according to scoring parameters
*/
// @{
/// flag that activates computation on homogeneous alignments
bool gv_homogeneous_flag = false;
/// score for each letter of the alignment alphabet
std::vector<int> gv_homogeneous_scores;
// @}
/** @name correlation computation
* @brief consider correlation computation on the set of alignments (and not sensitivity)
*/
// @{
/// flag that activates the correlation computation
bool gv_correlation_flag = false;
/// function choosen for the correlation computation @see gv_correlation_flag
int gv_correlation_function_index = 0;
/// function numbers for the correlation computation @see gv_correlation_function_index
#define CORRELATION_FUNCTIONS_ENUM_PEARSON 0
#define CORRELATION_FUNCTIONS_ENUM_SPEARMAN 1
#define CORRELATION_FUNCTIONS_ENUM_NUMBERS 2
/// function symbols for the correlation computation
char * gv_correlation_functions_names[CORRELATION_FUNCTIONS_ENUM_NUMBERS] = {(char *)"PEARSON",(char *)"SPEARMAN"};
// @}
/** @name simple covariance optimization
* @brief consider computing the covariance as done in "Morgenstern, Zhu et al. AMB 2015"
*/
// @{
bool gv_covariance_flag = false;
// @}
/** @name polynomial output
* @brief consider the coefficients on the set of alignments
*/
// @{
/// flag that activate the output of the "Mak Benson 2009" coefficients (number of alignments with @f$ x @f$ mismatches that are not detected, or detected, or with coverage/multihit value ...). Note that it can be used with "Chung Park 2010" definition of integral hits, but you need to replace the @f$ p^{l-x} \cdot (1-p)^{x} @f$ factor preceeding each coefficient for each mismatch rate of @f$ \frac{x}{l} @f$, by its integral @f$ \int_{p_a}^{p_b} p^{l-x} \cdot (1-p)^{x} dp = \Big[ p^{l-x+1} \sum_{j=0}^{x} {x \choose j} \frac{(-p)^j}{l-x+j} \Big]_{p_a}^{p_b} @f$. Fast and precise computation can be obtained with a fast power of the coefficients of the initial polynomial @f$ p^n (1-p)^x @f$ to avoid some small p "vs" big binomial problem.
/// flag that activate the selection of dominant seeds as in "Mak Benson 2009", which is also true for "Chung Park 2010"
/// (and not the selection according to their sensitivity only: its a more general form that may select more seeds)
bool gv_polynomial_dominant_selection_flag = false;
/// flag that activate multinomial evaluation (note : this is independent from the previous dominance selection). Note that this is only a polynomial evaluation, and does not have any other effect than ... FIXME
bool gv_multipoly_file_flag = false;
automaton<polynomial<BIGINT > > * gv_multipoly_bsens_automaton = NULL;
// @}
/** @name data management
* @brief pareto input/output files
*/
// @{
/// pareto output filename @see outputPareto
char * gv_output_filename = NULL;
/// pareto input filenames @see inputPareto
char * gv_input_filenames[256] = {NULL};
/// pareto input filenames (number of files) @see gv_input_filenames
int gv_nb_input_filenames = 0;
/// pareto selection every nbruns (print statitics also)
int gv_pareto_select_runs = 1000;
// @}
/** @name correlation coefficients computations
* @brief binomial coefficients and associated weight used by correlation coefficients computations
*/
// @{
/// binomial coefficients, must be pre-computed for correlations algorithms
std::vector<std::vector<BIGINT> > binomials;
/// binomials computed with the Pascal Triangle (overflow are checked and stop the program)
void generate_binomials(int N) {
binomials = std::vector<std::vector<BIGINT> >(N+1, std::vector<BIGINT>(N+1,0));
int k, n;
for (k = 1; k <= N; k++) binomials[0][k] = 0;
for (n = 0; n <= N; n++) binomials[n][0] = 1;
for (n = 1; n <= N; n++) {
for (k = 1; k <= N; k++) {
binomials[n][k] = binomials[n-1][k-1] + binomials[n-1][k];
#ifndef USEINFINT
if (binomials[n][k] < 0) {
_ERROR("this binary has been compiled with undefined USEINFINT (no infinite precision integer)"," generate_binomials DOES OVERFLOW ...");
}
#endif
}
}
}
/// tools used to reduce size of binomials weight : greatest common divisor of all of them
BIGINT gcd(BIGINT a, BIGINT b) {
if (a < b) {
BIGINT t = b; b = a; a = t;
}
while (b > 0) {
BIGINT t = b; b = a % b; a = t;
}
return a;
}
/// tools used to reduce size of binomials weight : lowest common multiple of all of them
BIGINT lcm(BIGINT a, BIGINT b) {
return a/gcd(a,b) * b;
}
/// binomial weight (inverted values of binomials coeeficients), must be pre-computed for correlations algorithms
#ifdef USEINFINT
std::vector<std::vector<BIGINT> > binomial_weights;
#else
std::vector<std::vector<double> > binomial_weights;
#endif
/// binomial weight flag, must be set to false when already computed
bool gp_binomial_weights_not_computed_flag = true;
void generate_binomials_weights(int N) {
#ifdef USEINFINT
binomial_weights = std::vector<std::vector<BIGINT> > (N+1, std::vector<BIGINT>(N+1,0));
#else
binomial_weights = std::vector<std::vector<double> > (N+1, std::vector<double>(N+1,0));
#endif
generate_binomials(N);
/*
* lcm { binomial(k,0) ; binomial(k,1) ; ... ; binomial(k,k) } = lcm {1;2;...;k;k+1} / (k+1)
* "An identity involving the least common multiple of binomial coefficients and its application" Bakir FARHI
*/
BIGINT lcmc = 1;
for (long long k_plus_1 = 1; k_plus_1 <= N+1 ; k_plus_1++) {
lcmc = lcm(lcmc,k_plus_1);
#ifndef USEINFINT
if (lcmc < 0) {
_ERROR("this binary has been compiled with undefined USEINFINT (no infinite precision integer)"," generate_binomials_weights DOES OVERFLOW ...");
}
#endif
BIGINT lcmc_div_k_plus_1 = lcmc / (BIGINT) k_plus_1;
long long k = k_plus_1 - 1;
for (long long i = 0; i <= k ; i++) {
BIGINT lcmc_div_k_plus_1_div_binomial_k_i = lcmc_div_k_plus_1/(binomials[k][i]);
#ifdef USEINFINT
binomial_weights[k][i] = lcmc_div_k_plus_1_div_binomial_k_i;
#else
binomial_weights[k][i] = (double)lcmc_div_k_plus_1_div_binomial_k_i;
#endif
}
}
}
// @}
/** @name command line functions
* @brief set of command line management functions
*/
// @{
/// display a 2d matrix (a vector of vectors of int)
void DISPLAYMATRIX( std::ostream & os, std::vector<std::vector<int> > & matrix );
/// describe the command line parameters
void USAGE() {
cerr << "* USAGE ("<< PACKAGE << " v" << VERSION << ") :" << endl;
cerr << " " << PACKAGE << " [options]" << endl;
cerr << " -h : display this Help screen" << endl;
cerr << " -v <int> : set verbose mode (default = " << gv_verbose << ")" << endl;
cerr << " " << endl;
cerr << " 1) Seed Model :" << endl;
cerr << " a) Data " << endl;
cerr << " -A <int> : align alphabet size \'A\' (default = " << gv_align_alphabet_size << ")" << endl;
cerr << " -B <int> : seed alphabet size \'B\' (default = " << gv_seed_alphabet_size << ")" << endl;
cerr << " -M \"{{<bool>}}\" table : subset seed Matching matrix (default = \""; DISPLAYMATRIX(cerr, gv_subsetseed_matching_matrix); cerr << "\")" << endl;
cerr << " * note : parameter is a C-like matrix of 'A'x'B' cols x rows " << endl;
cerr << " * example : -M \"{{1,0,1},{0,1,1}}\" (for A=3 and B=2) " << endl;
cerr << " -MF <filename> : gives the matrix inside a file (when too large for command line)" << endl;
cerr << " " << endl;
cerr << " b) Subset Seed / Vectorized Subset Seed " << endl;
cerr << " -V : select the Vectorized subset seed model (default = disabled)" << endl;
cerr << " -S \"{{<int>}}\" table : vectorized subset seed Scoring table (default = \""; DISPLAYMATRIX(cerr, gv_vectorizedsubsetseed_scoring_matrix); cerr << "\")" << endl;
cerr << " -SF <filename> : gives the table inside a file (when too large for command line)" << endl;
cerr << " -T <int> : vectorized subset seed scoring minimal Threshold (default = " << gv_vectorizedsubsetseed_scoring_threshold << ")" << endl;
cerr << " " << endl;
cerr << " c) Subset Seed / Lossless Subset Seed " << endl;
cerr << " -L <int>,<int>,... : select the Lossless mode and set and the lossless alignment costs" << endl;
cerr << " -X <int> : set the alignment maX cost to be considered in the alignment set" << endl;
cerr << " " << endl;
cerr << " 2) Alignments:" << endl;
cerr << " -b <dbl>,<dbl>,... : set the Bernoulli/Markov background distribution (size |'A'|^k)" << endl;
cerr << " -f <dbl>,<dbl>,... : set the Bernoulli/Markov foreground distribution (size |'A'|^k)" << endl;
cerr << " -fF <filename> : set the foreground model (as a probabilistic NFA)" << endl;
cerr << " for more details, see http://bioinfo.univ-lille.fr/yass/iedera.php#fFFormat" << endl;
cerr << " -l <int> : set the alignment length (default = " << gv_alignment_length << ") " << endl;
cerr << " -ll <int> : select the sub-align window computation and set its length (default = disabled)" << endl;
cerr << " -llf <function> : set the sub-align function used to merge the results (default = \"min\")" << endl;
cerr << " * note : available functions are ";
for (int i=0; i<SUBALIGNMENT_FUNCTIONS_NUMBER; i++) {
cerr << " \"" << gv_subalignment_functions_names[i] << "\"";
}
cerr << endl;
cerr << " -u <int>,<int>,... : select the homogeneous alignment model and set the scores (default = disabled)" << endl;
cerr << " " << endl;
cerr << " 3) Seed Selection / Enumeration :" << endl;
cerr << " -n <int> : number of consecutive seeds to design (default = " << gv_seeds.size() << ")" << endl;
cerr << " -s <int>,<int> : minimal/maximal seed span (default min = " << gv_minspan << ",max = " << gv_maxspan << ")" << endl;
cerr << " -w <dbl>,<dbl> : minimal/maximal seed weight (default min = " << gv_minweight << ", max = " << gv_maxweight << ")" << endl;
cerr << " * symbol # (if defined, otherwise the max of all symbols) is of weight 1.0 " << endl;
cerr << " * symbol _ is of weight 0.0 " << endl;
cerr << " * symbol e weight is given by 'log_#(e matches background distribution)'" << endl;
cerr << " -i <int>,<int>,... : signature id (number of '0','1',...'B-1' elements inside a subset seed)" << endl;
cerr << " * note : the first number can be 0 to be adjusted on the fly depending of seed span." << endl;
cerr << " * example : for transitive seeds, \"-i 0,2,8\" enumerate seeds with 8 \'#\', 2 \'@\', and any number of \'-\'." << endl;
cerr << " * note : \"-i shuffle\" set after \"-m <patterns>\" shuffles seeds elements independently without changing weight/span" << endl;
cerr << " -j <dbl> : difference of weight (given as a ratio) allowed between consecutive seeds." << endl;
cerr << " * note : between 0.0 for miminal change and 1.0 for maximal change (default = " << gv_jive << ")" << endl;
cerr << " -r <int> : random enumeration (default = " << gv_nbruns << " is complete enumeration)" << endl;
cerr << " * note : must be used with -k hill climbing for better performances. " << endl;
cerr << " -k : hill climbing heuristic (for random enumeration)." << endl;
cerr << " * note : it checks all permutations of two different elements inside one seed," << endl;
cerr << " it checks the edit (add/removal) of one jocker '-' inside the seed." << endl;
cerr << " -a <dbl> : hill climbing heuristic frequency: good seeds are more likely to be optimized" << endl;
cerr << " -x : only symetric seeds are selected" << endl;
cerr << " -y <int> : consider sensitivity when y hits are required (multihit sensitivity)" << endl;
cerr << " -g <int> : consider sensitivity when g coverage is required (coverage sensitivity)" << endl;
cerr << " * note : when a seed hits, its coverage is computed as the sum of its elements symbols per position" << endl;
cerr << " and for several possibly overlapping seed hits, the \"best\" element per position is kept." << endl;
cerr << " * example : \"##-# \" gives 5 on a spaced seed alphabet, \"#@-# \" gives 9 (4x2+1) on a transitive alphabet," << endl;
cerr << " \" ##-#\" because 5'#' with '#'=1. \" #@-#\" because 4'#' wt '#'=2, one '@' wt '@'=1."<< endl;
cerr << " * note : only available with subset seeds (not with vectorized subset seeds)." << endl;
cerr << " " << endl;
cerr << " * -y/g note : -y/-g can be used with SPEARMAN/PEARSON correlation with the alignment %%id (only for -A 2)." << endl;
cerr << " you can select the minimal number of matches (%%id) by setting -y <int> before -y <SPEARMAN/PEARSON>" << endl;
cerr << " * example : \"-spaced -l 64 -y 32 -y PEARSON\" for a %%id of 50%% and PEARSON correlation computation" << endl;
cerr << " -p : activate \"Mak Benson 2009\" dominant selection and output polynomial." << endl;
cerr << " * note : this is useful for iid models only (Bernoulli,...) " << endl;
cerr << " * example : \"-spaced -l 8 -m \"##-#\" -p\" must output the additional values :"<< endl;
cerr << " 0,0=1;1,0=8;2,0=28;3,0=51;4,0=45;5,0=15;6,0=1;"<< endl;
cerr << " 3,1=5;4,1=25;5,1=41;6,1=27;7,1=8;8,1=1;" << endl;
cerr << " " << endl;
cerr << " for the (matching) polynomial (second line of additional values: terms contain \",1\"):" << endl;
cerr << " 3 8-3 4 8-4 5 8-5 6 8-6 7 8-7 8 8-8" << endl;
cerr << " 5.p.q + 25.p.q + 41.p.q + 27.p.q + 8.p.q + 1.p.q" << endl;
cerr << " with q = 1-p" << endl;
cerr << " " << endl;
cerr << " or the complementary (non-matching) polynomial (first line of additional values: terms contain \",0\"):" << endl;
cerr << " 0 8-0 1 8-1 2 8-2 3 8-3 4 8-4 5 8-5 6 8-6" << endl;
cerr << " 1.p.q + 8.p.q + 28.p.q + 51.p.q + 45.p.q + 15.p.q + 1.p.q" << endl;
cerr << " " << endl;
cerr << " -pF <filename> : activate general polynomial evaluation and load the associated file automaton." << endl;
cerr << " * example : \"-spaced -l 5 -m \"##-#\" -pF _file_\" where the _file_ is:" << endl;
cerr << " " << endl;
cerr << " |3 0 1 0 0" << endl;
cerr << " | 1 0" << endl;
cerr << " | 1 0 0 1 1 x" << endl;
cerr << " | 1 1 2 1 - x" << endl;
cerr << " | 2 0 0 1 1 1 - y" << endl;
cerr << " | 1 1 2 y" << endl;
cerr << " " << endl;
cerr << " will give the following result (with spaces between each ^*+- symbols):" << endl;
cerr << " " << endl;
cerr << " [y - x*y - x*y^2 + 2*x*y^3 - x^2*y + 2*x^2*y^2 - 2*x^2*y^3 + x^3*y - x^3*y^2]" << endl;
cerr << " " << endl;
cerr << " -c <int>,<int>,... : consider sensitivity when each seed is indexed on 1/c of its positions" << endl;
cerr << " * note : the position is enumerated or choosen randomly depending on -r parameter" << endl;
cerr << " * example : \"##-#:1/5\" means that the seed \"##-#\" will be placed at 1st,6th,11th... positions" << endl;
cerr << " -q <int>,<int>,... : consider sensitivity when each seed is indexed on q/c of its positions" << endl;
cerr << " * note : parameter -c must be specified before, positions are enumerated or randomly drawn (-r)" << endl;
cerr << " * example : \"##-#:2,5/5\" means that the seed \"##-#\" will be placed at 2nd,5th,7th,10th... positions" << endl;
cerr << " -d : disable Hopcroft minimization (default is enabled) " << endl;
cerr << " -m \"<seeds>\" : check the given seed selectivity/sensitivity " << endl;
cerr << " * note : possibility to add cycle content as shown before (see -c -q \"* example :\" )" << endl;
cerr << " -mx \"<seeds>\" : generate alignments that exclude the given seed pattern" << endl;
cerr << " * note : possibility to add cycle content as shown before (see -c -q \"* example :\" )" << endl;
cerr << " -mxy <int> : as -y, consider -mx when y hits are required (multihit criterion)" << endl;
cerr << " " << endl;
cerr << " 4) Input / Ouput :" << endl;
cerr << " -BSymbols '<char>' : seed alphabet symbols to display (default is '01..9AB..Zab..z')" << endl;
cerr << " -e <filename> : input an initial \"pareto set\" file before optimizing" << endl;
cerr << " -o <filename> : output the \"pareto set\" to this file after optimizing" << endl;
cerr << " -z <int> : print statistics every \"z\" seeds (default = " << gv_pareto_select_runs << ", 0 to disable)" << endl;
cerr << " " << endl;
cerr << " 5) Nucleic Spaced Seeds : (shortcuts : you may overload the weight -w and span -s after)" << endl;
cerr << " -transitive : \"-A 3 -B 3 -f .15,.15,.70 -b 0.5,0.25,0.25 -BSymbols '-@#' -l 64 -s 1,8\"" << endl;
cerr << " -spaced : \"-A 2 -B 2 -f .30,.70 -b 0.75,0.25 -BSymbols '-#' -l 64 -s 1,8\"" << endl;
cerr << " -iupac : \"-A 16 -B 27 -f <TAM30,gc50,kappa1> -b ... -BSymbols 'ACGTRrYySsWwKkMmBbDdHhVvn@N'\""<< endl;
cerr << " \"-M {<iupac 16 x 27 specific matrix>} -l 64 -s 1,4\"" << endl;
exit(-1);
}
/// display a 2d matrix (a vector of vectors of int)
void DISPLAYMATRIX(std::ostream & os, std::vector<std::vector<int> > & matrix ) {
os << "{";
for (int b = 0; b < gv_seed_alphabet_size; b++) {
os << "{";
for (int a = 0; a < gv_align_alphabet_size; a++) {
os << (matrix[a][b]);
if (a < (gv_align_alphabet_size-1)) os << ",";
}
os << "}";
if (b < (gv_seed_alphabet_size-1)) os << ",";
}
os << "}";
}
/// display a 1d vector (vector of int)
void DISPLAYTABLE(std::ostream & os, std::vector<double> & table ) {
os << "{";
for (int a = 0; a < (int)table.size(); a++) {
os << (table[a]);
if (a < (int)(table.size() - 1))
os << ",";
}
os << "}";
}
/// check if the sum of probabilities is ~ 1.0
void CHECKPROB(std::vector<double> & table) {
double result = 0.0;
for (int j = 0; j < (int)table.size(); j++)
result += (table)[j];
if (result < 0.99 || result > 1.01)
_ERROR("CHECKPROB"," sum of " << (table)[0] << " + ... + " << (table)[table.size()-1] << " must be 1.0 and not " << result);
}
/// parse one flag
void PARSEFLAG(int & i, char ** argv, int argc, bool & var, bool value) {
var = value;
}
/// parse and check one integer
void PARSEINT(int & i, char ** argv, int argc, int & var, int min, int max) {
i++;
if (i >= argc)
_ERROR("PARSEINT","\"" << argv[i-1] << "\" found without argument");
char * next = argv[i];
int i_tmp = strtol(argv[i], &next, 10);
if ((*argv[i] == '\0') || ((*next) != '\0'))
_ERROR("PARSEINT","\"" << argv[i-1] << "\" is not followed by an integer");
if (i_tmp>max || i_tmp <min)
_ERROR("PARSEINT","\"" << argv[i-1] << "\" is followed by an integer outside the valid range");
var = i_tmp;
}
/// parse and check one integer or a set of options from an enum
void PARSEINTORENUM(int & i, char ** argv, int argc, int & var, int min, int max, char * enum_string_values[], int enum_string_values_size, bool &enum_set, int &enum_set_value) {
i++;
if (i >= argc)
_ERROR("PARSEINTORENUM","\"" << argv[i-1] << "\" found without argument");
char * next = argv[i];
/* check in allowed strings before */
for (int j = 0; j < enum_string_values_size; j++) {
if (strcmp(enum_string_values[j],next) == 0) {
enum_set = true;
enum_set_value = j;
return;
}
}
/* check in integers in not a string */
int i_tmp = strtol(argv[i], &next, 10);
if ((*argv[i] == '\0') || ((*next) != '\0')) {
cerr << "[*] = (";
for (int j = 0; j < enum_string_values_size; j++) {
if (j)
cerr << ",";
cerr << (enum_string_values[j]);
}
cerr << ")" << endl;
_ERROR("PARSEINTORENUM","\"" << argv[i-1] << "\" is not followed by an integer or a correct value [*]");
}
/* FIXME HERE : more detail */
if (i_tmp > max || i_tmp < min)
_ERROR("PARSEINTORENUM","\"" << argv[i-1] << "\" is followed by an integer outside the valid range");
var = i_tmp;
}
/// parse and check a pair of integers
void PARSEDINT(int & i, char ** argv, int argc,
int & var1, int min1, int max1,
int & var2, int min2, int max2) {
i++;
if (i >= argc)
_ERROR("PARSEDINT","\"" << argv[i-1] << "\" found without argument");
int i_tmp = sscanf(argv[i],"%i,%i",&var1,&var2);
if (i_tmp != 2 )
_ERROR("PARSEDINT","\"" << argv[i-1] << "\" is not followed by an <int>,<int>");
if (var1 < min1 || var1 > max1 || var2 < min2 || var2 > max2)
_ERROR("PARSEDINT","\"" << argv[i-1] << "\" is followed by integers out of the range");
if (var1 > var2)
_ERROR("PARSEDINT","\"" << var1 << "\" must be <= \"" << var2 << "\"");
}
/// parse and check a list of integers
void PARSEINTS(int & i, char ** argv, int argc,
std::vector<int> & r_table, int neededsize = 0, bool minmax=false, int vmin=0, int vmax=1000, bool complete=false) {
i++;
if (i >= argc)
_ERROR("PARSEINTS","\"" << argv[i-1] << "\" found without argument");
// read table
r_table.clear();
char * pch = strtok(argv[i],",;");
while (pch){
int value = 0;
int i_tmp = sscanf(pch,"%d",&value);
if (i_tmp != 1)
_ERROR("PARSEINTS","\"" << pch << "\" is not a correct <int>");
if (minmax) {
if (value < vmin || value > vmax)
_ERROR("PARSEINTS","\"" << value << "\" is by an integer out of the range");
}
r_table.push_back(value);
pch = strtok(NULL,",;");
}
// check table size if needed
if (neededsize != 0) {
if (complete && (int) r_table.size() < neededsize) {
while ((int)r_table.size() < neededsize)
r_table.push_back(r_table.back());
}
if ((int)r_table.size() != neededsize)
_ERROR("PARSEINTS", " there is not a correct number of <int> values (" << (r_table.size()) << ") given by " << argv[i-1] << " when compared with the needed size " << neededsize);
}
}
/// parse and check a double
void PARSEDOUBLE(int & i, char ** argv, int argc, double & var , double min, double max) {
i++;
if (i >= argc)
_ERROR("PARSEDOUBLE","\"" << argv[i-1] << "\" found without argument");
char * next = argv[i];
double d_tmp = strtod(argv[i],&next);
if ((*argv[i] == '\0') || ((*next) != '\0'))
_ERROR("PARSEDOUBLE","\"" << argv[i-1] << "\" is not followed by a double");
if (d_tmp>max || d_tmp <min)
_ERROR("PARSEDOUBLE","\"" << argv[i-1] << "\" is followed by a double outside the valid range");
var = d_tmp;
}
/// parse and check a pair of double
void PARSEDDOUBLE(int & i, char ** argv, int argc,
double & var1, double min1, double max1,
double & var2, double min2, double max2) {
i++;
if (i >= argc)
_ERROR("PARSEDDOUBLE","\"" << argv[i-1] << "\" found without argument");
int i_tmp = sscanf(argv[i],"%lf,%lf",&var1,&var2);
if (i_tmp != 2 )
_ERROR("PARSEDDOUBLE","\"" << argv[i-1] << "\" is not followed by an <dbl>,<dbl>");
if (var1 < min1 || var1 > max1 || var2 < min2 || var2 > max2)
_ERROR("PARSEDDOUBLE","\"" << argv[i-1] << "\" is followed by doubles out of the range");
if (var1 > var2)
_ERROR("PARSEDDOUBLE","\"" << var1 << "\" must be <= \"" << var2 << "\"");
}
/// parse and check a list of double
void PARSEDOUBLES(int & i, char ** argv, int argc,
std::vector<double> & r_table, int neededsize = 0, bool positive_values=false) {
i++;
if (i >= argc)
_ERROR("PARSEDOUBLES","\"" << argv[i-1] << "\" found without argument");
// read table
r_table.clear();
char * pch = strtok(argv[i],",;");
while (pch){
double value = 0.0;
int i_tmp = sscanf(pch,"%lf",&value);
if (i_tmp != 1)
_ERROR("PARSEDOUBLES","\"" << pch << "\" is not a correct <dbl>");
if (positive_values) {
if (value < 0)
_ERROR("PARSEDOUBLES","\"" << value << "\" is a negative <dbl>");
}
r_table.push_back(value);
pch = strtok(NULL,",;");
}
// check table size if needed
if (neededsize != 0) {
if ((int)r_table.size() != neededsize)
_ERROR("PARSEDOUBLES", " there is not a correct number of <dbl> values (" << (r_table.size()) << ") given by " << argv[i-1] << " when compared with the needed size " << neededsize);
}
}
/// parse and check if an options commes from a list of strings (retunr the index of the string)
void PARSESTRING(int & i, char ** argv, int argc,
int & value,
char * values[],
int values_size
) {
i++;
if (i >= argc)
_ERROR("PARSESTRING","\"" << argv[i-1] << "\" found without argument");
for (int j=0; j<values_size; j++) {
if (strcmp(argv[i],values[j]) == 0) {
value = j;
return;
}
}
_ERROR("PARSESTRING","\"" << argv[i-1] << "\" argument \"" << argv[i] << "\" is not accepted by this option");
}
/// check if a signature is compatible with a given weight range (this must be tested when the weight is changed, or the signature changed)
void CHECKSIGNATURE() {
if (gv_weight_interval_flag && gv_signature_flag) {
double signature_weight = 0.0;
for (int b = 0; b < gv_seed_alphabet_size; b++)
signature_weight += gv_signature[b] * gv_bsel_weight[b];
if (signature_weight < gv_minweight)
_ERROR("CHECKSIGNATURE", " the \"-i\" signature computed weight " << signature_weight << " is too small when compared with the current weight range " << gv_minweight << "," << gv_maxweight);
if (signature_weight > gv_maxweight)
_ERROR("CHECKSIGNATURE", " the \"-i\" signature computed weight " << signature_weight << " is too large when compared with the current weight range " << gv_minweight << "," << gv_maxweight);
}
}
/// parse and check a signature (number of seed elements inside a seed)
void PARSESIGNATURE(int & i, char ** argv, int argc, std::vector<int> & r_table) {
i++;
if (i >= argc)
_ERROR("PARSESIGNATURE","\"" << argv[i-1] << "\" found without argument");
// read table
char * pch = strtok(argv[i],",;");
r_table.clear();
while (pch){
int value = 0;
int i_tmp = sscanf(pch,"%d",&value);
if (i_tmp != 1)
_ERROR("PARSESIGNATURE","\"" << pch << "\" is not a correct integer");
if (value < 0 || value >= 64)
_ERROR("PARSESIGNATURE","\"" << pch << "\" is out of the range [0..N] (N=64)");
r_table.push_back(value);
pch = strtok(NULL,",;");
}
// check table size
if ((int)r_table.size() != gv_seed_alphabet_size)
_ERROR("PARSESIGNATURE", " there is not a correct number of <int> values given by " << argv[i-1] << " when compared with the current seed alphabet size B = " << gv_seed_alphabet_size);
// check signature and weight interval
CHECKSIGNATURE();
}
/// parse and check a homogeneous scoring system
void PARSEHOMOGENEOUS(int & i, char ** argv, int argc, std::vector<int> & r_table) {
i++;
if (i >= argc)
_ERROR("PARSEHOMOGENEOUS","\"" << argv[i-1] << "\" found without argument");
// read table
char * pch = strtok(argv[i],",;");
r_table.clear();
while (pch){
int value = 0;
int i_tmp = sscanf(pch,"%d",&value);
if (i_tmp != 1)
_ERROR("PARSEHOMOGENEOUS","\"" << pch << "\" is not a correct integer");
if (value < -1000 || value > 1000)
_ERROR("PARSEHOMOGENEOUS","\"" << pch << "\" is out of the range [-1000..1000]");
r_table.push_back(value);
pch = strtok(NULL,",;");
}
// check table size
if ((int)r_table.size() != gv_align_alphabet_size)
_ERROR("PARSEHOMOGENEOUS", " there is not a correct number of <int> values given by " << argv[i-1] << " when compared with the current align alphabet size A = " << gv_align_alphabet_size);
}
/// parse and check a set of probabilities
void PARSEPROBS(int & i, char ** argv, int argc,
std::vector<double> & r_table, int & k) {
i++;
if (i >= argc)
_ERROR("PARSEPROBS", "\"" << argv[i-1] << "\" found without argument");
// read table
char * pch = strtok(argv[i], ",;");
r_table.clear();
while (pch){
double value = 0.0;
int i_tmp = sscanf(pch, "%lf", &value);
if (i_tmp != 1)
_ERROR("PARSEPROBS","\"" << pch << "\" is not a correct <dbl>");
if (value < 0 || value > 1 )
_ERROR("PARSEPROBS","\"" << pch << "\" is out of the range [0,1]");
r_table.push_back(value);
pch = strtok(NULL,",;");
}
// check table size
int a = gv_align_alphabet_size;
k = 0;
while(1) {
if ((int)r_table.size() == a)
break;
else if ((int)r_table.size() < a)
_ERROR("PARSEPROBS", " there is not a correct number of <dbl> values (" << (r_table.size()) << ") given by " << argv[i-1] << " when compared with the current alphabet size A = " << gv_align_alphabet_size);
a *= gv_align_alphabet_size;
k++;
}
CHECKPROB(r_table);
}
/// parse and check a set of probabilities as an automaton file
void PARSEPROBSAUTOMATONFILE(int & i, char ** argv, int argc, automaton<double> * &r_automaton) {
i++;
if (i >= argc)
_ERROR("PARSEPROBSAUTOMATONFILE","\"" << argv[i-1] << "\" found without argument");
// read file
ifstream ifs_file;
ifs_file.open(argv[i]);
if (!ifs_file){
_ERROR("PARSEPROBSAUTOMATONFILE","unreadable file \"" << argv[i] << "\" ");
}
// read the content and set the automaton
if (r_automaton)
delete r_automaton;
r_automaton = new automaton<double>();
ifs_file >> (*r_automaton);
ifs_file.close();
}
/// parse and check a set of polynomial probabilities as an automaton file
void PARSEMULTIPOLYAUTOMATONFILE(int & i, char ** argv, int argc, automaton<polynomial<BIGINT > > * &r_automaton) {
i++;
if (i >= argc)
_ERROR("PARSEPOLYPROBSAUTOMATONFILE","\"" << argv[i-1] << "\" found without argument");
// read file
ifstream ifs_file;
ifs_file.open(argv[i]);
if (!ifs_file){
_ERROR("PARSEPOLYPROBSAUTOMATONFILE","unreadable file \"" << argv[i] << "\" ");
}
// read the content and set the automaton
if (r_automaton)
delete r_automaton;
r_automaton = new automaton<polynomial<BIGINT > >();
ifs_file >> (*r_automaton);
ifs_file.close();
}
/// set of states on a matrix input @see PARSEMATRIX
typedef enum {
dummy,
readfirstopenbracket,
readsecondopenbracket,
readint,
readintseparator,
readsecondclosedbracket,
readsecondbracketseparator,
readfirstclosedbracket,
readfirstbracketseparator
} automatareadmatrixstates;
/// parse and check a matrix input
void PARSEMATRIX(int & i, char ** argv, int argc, std::vector<std::vector<int> > & matrix, int min, int max, int gnbcolumns, int gnbrows) {
i++;
if (i >= argc)
_ERROR("PARSEMATRIX","\"" << argv[i-1] << "\" found without argument");
char * pch = argv[i];
int nbcolumns = 0, nbrows = 0;
automatareadmatrixstates state = dummy;
// automata
while(*pch) {
switch (*pch){
case '{':
switch (state){
case dummy:
nbrows = 0;
state = readfirstopenbracket;
break;
case readfirstopenbracket:
case readsecondclosedbracket:
case readsecondbracketseparator:
nbcolumns = 0;
nbrows++;
if (nbrows > gnbrows)
_ERROR("PARSEMATRIX","incorrect number of lines : " << nbrows << " != " << gnbrows << " in \"" << argv[i] << "\" string parameter ");
state = readsecondopenbracket;
break;
default:
_ERROR("PARSEMATRIX","incorrect \'" << *pch << "\' in \"" << argv[i] << "\" string parameter ");
}
pch++;
break;
case '}':
switch (state){
case readint:
case readintseparator:
state = readsecondclosedbracket;
if (nbcolumns != gnbcolumns)
_ERROR("PARSEMATRIX","incorrect number of columns : " << nbcolumns << " != " << gnbcolumns << " in \"" << argv[i] << "\" string parameter ");
break;
case readsecondclosedbracket:
state = readfirstclosedbracket;
if (nbrows != gnbrows )
_ERROR("PARSEMATRIX","incorrect number of lines : " << nbrows << " != " << gnbrows << " in \"" << argv[i] << "\" string parameter ");
break;
default:
_ERROR("PARSEMATRIX","incorrect \'" << *pch << "\' in \"" << argv[i] << "\" string parameter ");
}
pch++;
break;
case ' ':
case '\t':
case '\n':
pch++;
break;
case ';':
case ',':
switch (state){
case readint:
state = readintseparator;
break;
case readsecondclosedbracket:
state = readsecondbracketseparator;
break;
case readfirstclosedbracket:
state = readfirstbracketseparator;
break;
default:
_ERROR("PARSEMATRIX","incorrect \'" << *pch << "\' in \"" << argv[i] << "\" string parameter ");
}
pch++;
break;
default: // read value
if (state == readsecondopenbracket || state == readintseparator) {
state = readint;
int value = strtol (pch, &pch, 10);
if (value>max || value<min)
_ERROR("PARSEMATRIX","\"" << argv[i] << "\" contains at least one integer outside the valid range");
if (nbcolumns >= gnbcolumns)
_ERROR("PARSEMATRIX","incorrect number of columns : " << (nbcolumns+1) << ">" << gnbcolumns << " in \"" << argv[i] << "\" string parameter ");
if (nbrows > gnbrows)
_ERROR("PARSEMATRIX","incorrect number of lines : " << nbrows << ">" << gnbrows << " in \"" << argv[i] << "\" string parameter ");
matrix[nbcolumns][nbrows-1] = value;
nbcolumns++;
} else {
_ERROR("PARSEMATRIX","incorrect \'" << *pch << "\' in \"" << argv[i] << "\" string parameter ");
}
}
} // while
if (state == readfirstclosedbracket || state == readfirstbracketseparator)
return;
else
_ERROR("PARSEMATRIX","\" unfinished string parameter \"" << argv[i] << "\" ");
}