-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathsPARTA.py
executable file
·3217 lines (2691 loc) · 136 KB
/
sPARTA.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
## sPARTA: small RNA-PARE Target Analyzer public version
## Updated: version-1.27 11/05/2017
## Author: [email protected]
## Author: [email protected]
## Copyright (c): 2016, by University of Delaware
## Contributor : Atul Kakrana
## Affilation : Meyers Lab (Donald Danforth Plant Science Center, St. Louis, MO)
## License copy: Available at https://opensource.org/licenses/Artistic-2.0
#### PYTHON FUNCTIONS ##############################
import sys,os,re,time,glob,shutil,operator,datetime,argparse,importlib,getpass
import subprocess, multiprocessing
from multiprocessing import Process, Queue, Pool
from operator import itemgetter
from os.path import expanduser
#### USER SETTINGS ################################
parser = argparse.ArgumentParser()
parser.add_argument('-annoType', default='', help='GFF if the annotation file'\
'file is a GFF file. GTF if the annotation file is a GTF file')
parser.add_argument('-annoFile', default='', help='GFF/GTF file for the '\
'species being analyzed corresponding to the genome assembly being used')
parser.add_argument('-genomeFile', default='', help='Genome file in FASTA '\
'format')
parser.add_argument('-featureFile', default='', help='Feature file in FASTA '\
'format')
parser.add_argument('-genomeFeature', required=True, help='0 if prediction is '\
'to be done in genic region. 1 if prediction is to be done in intergenic '\
'region')
parser.add_argument('-miRNAFile', default='', help='FASTA format of miRNA '\
'sequences')
parser.add_argument('-tarPred', nargs='?', const='H', help='Mode of target '\
'prediction. H for heuristic. E for exhaustive. H is default if no mode '\
'is specified')
parser.add_argument('-tarScore', nargs='?', const='S', help='Scoring mode '\
'for target prediction. S for seedless. N for normal. S is default if '\
'no mode is specified')
parser.add_argument('-libs', nargs='*', default=[], help='List of PARE '\
'library files in tag count format. Data can be converted into tag '\
'count format using')
parser.add_argument('-minTagLen', default=20, type=int, help='Minimum length '\
'of PARE tag. Tags shorter than minTagLen will be discarded. 20 is '\
'default')
parser.add_argument('-maxTagLen', default=30, type=int, help='Maximum length '\
'of PARE tag. Tags longer than maxTagLen will be chopped to the specified '\
'length. 30 is default')
parser.add_argument('--tag2FASTA', action='store_true', default=False, help=
'Convert tag count file for PARE libraries to FASTA files for mapping')
parser.add_argument('--map2DD', action='store_true', default=False, help=
'Map the PARE reads to feature set')
parser.add_argument('--validate', action='store_true', default=False, help=
'Flag to perform the validation of the potential cleave sites from '\
'miRferno')
parser.add_argument('--repeats', action='store_false', default=True, help=
'Flag to include PARE reads from repetitive regions')
parser.add_argument('--noiseFilter', action='store_false', default=True,
help='Flag to include all PARE validations with p-value of <=.5, '\
'irrespective of the noise to signal ratio at cleave site and category '\
'of PARE read')
parser.add_argument('-accel', default='Y', help='Y to use '\
'balanced multiple process scheme or else specify the number of '\
'processors to be used. Y is default')
parser.add_argument('--standardCleave', action='store_true', default=False,
help='Flag to use standard cleave locations (10th, 11th and 12th '\
'positions), or rules more specific to miRNA size')
parser.add_argument('--viewerdata', action='store_true', default=False,
help='Flag to prepare additional SAM files for mapped PARE reads and '\
'BED file validated targets from all libraries combined')
### Developer Options ###
parser.add_argument('--generateFasta', action='store_false', default=False,
help=argparse.SUPPRESS)
parser.add_argument('--fileFrag', action='store_true', default=False,
help=argparse.SUPPRESS)
parser.add_argument('--indexStep', action='store_true', default=False,
help=argparse.SUPPRESS)
parser.add_argument('-splitCutoff', default=20, help=argparse.SUPPRESS)
parser.add_argument('-maxHits', default=30, help=argparse.SUPPRESS)
parser.add_argument('--cat4Show', action='store_false', default=True,
help=argparse.SUPPRESS)
args = parser.parse_args()
### Various checks for dependencies within command line arguments
# If either annotation or genome file is given without the other and
# featureFile is not given, exit.
if(((args.annoFile and not args.genomeFile) or (args.genomeFile and not
args.annoFile)) and (not args.featureFile)):
print("annoFile and genomeFile both must be provided to extract seqeunces")
exit()
# If annoType is provided and not GFF or GTF, report the error and exit
if(args.annoType and args.annoType != 'GFF' and args.annoType != 'GTF'):
print("annoType must be either GFF or GTF")
exit()
# If either the annotation file or annotation type is given without the other,
# exit.
if((args.annoType and not args.annoFile) or (args.annoFile and not
args.annoType)):
print("annoType and annoFile must both be give to parse either the GFF "\
"or GTF file.")
exit()
# If the user input both a genome and feature file, exit as both cannot be
# supplied for proper execution
if(args.genomeFile and args.featureFile):
print("genomeFile and featureFile cannot both be supplied for execution")
exit()
# If annoFile and genomeFile are given turn on extraction, frag and index steps
# must be set on
if(args.annoFile and args.genomeFile):
args.generateFasta = True
args.fileFrag = True
args.indexStep = True
# If featureFile is given, frag and index steps must be set on
if(args.featureFile):
# If featureFile is given and annoFile is given, give a warning letting
# user know the annoFile will be ignored and the input fasta file may
# have been intended as a genomeFile
if(args.annoFile):
print("Warning: You have input a annoFile but input a FASTA file as "\
"the featureFile. If you intended for this to be used in conjunction "\
"with the annotation file to create a feature file, please press "\
"'ctrl+c' to cancel the execution and rerun with the FASTA file "\
"under the argument 'genomeFile'. If this is in fact the feature "\
"file, allow sPARTA to continue its execution.")
time.sleep(10)
args.fileFrag = True
args.indexStep = True
# If indexStep is on and tarPred is off, turn tarPred and tarScore on
if(args.indexStep):
if(not args.tarPred):
args.tarPred = 'H'
if(not args.tarScore):
args.tarScore = 'S'
# If tarPred is on, then tarScore will default to S
if(args.tarPred and not args.tarScore):
args.tarScore = 'S'
# If tarPred is on, then miRNAFile must be provided
if(args.tarPred and not args.miRNAFile):
print("miRNA file must be given to perform target prediction")
exit()
# If tag2FASTA is on, turn map2DD on
if(args.tag2FASTA and not args.map2DD):
args.map2DD = True
# If tag2FASTA is on, then libraries must be defined
if(args.tag2FASTA and not args.libs):
print("libs must be assigned to perform tag2FASTA")
exit()
# If validate is on, then libraries must be input
if(args.validate and not args.libs):
print("At least one library must be given to perfor the validate")
exit()
# genomeFeature must be an integer
args.genomeFeature = int(args.genomeFeature)
####################################################################
#### sPARTA FUNCTIONS ##############################################
def checkUser():
'''
Checks if user is authorized to use script
'''
print ("\n#### Fn: Checking user #################################")
auser = getpass.getuser()
localtime = time.asctime( time.localtime(time.time()) )
# print ("Local current time :", localtime)
print("Hello '%s' - %s" % (auser,localtime))
print("You can seek help or report issues at: https://github.com/atulkakrana/sPARTA/issues" )
# if auser in allowedUser:
# print("Hello '%s' - Issues need to be reproted: https://github.com/atulkakrana/phasTER/issues \n" % (auser))
# else:
# print("YOU ARE NOT AUTHORIZED TO USE DEVELOPMENT VERSION OF 'sPARTA'")
# print("Contact 'Atul Kakrana' at [email protected] for permission\n")
# sys.exit()
return None
def checkLibs():
'''Checks for required components on user system'''
print("\n#### Fn: Check libraries and components ################")
goSignal = 1
# isRpy2 = importlib.find_loader('rpy2')
# if isRpy2 is None:
# print("--rpy2 : missing")
# goSignal = 0
# else:
# print("--rpy2 : found")
# pass
isNumpy = importlib.find_loader('numpy')
if isNumpy is None:
print("--numpy : missing")
goSignal = 0
else:
print("--numpy : found")
pass
isScipy = importlib.find_loader('scipy')
if isScipy is None:
print("--scipy : missing")
goSignal = 0
else:
print("--scipy : found")
pass
# isPyfasta = importlib.find_loader('pyfasta')
# if isPyfasta is None:
# print("--pyfasta: missing")
# goSignal = 0
# else:
# print("--pyfasta: found")
# pass
if goSignal == 0:
print("\nPlease install missing libraries before running the analyses")
print("See README for how to install these")
print("sPARTA has unmet dependendies and will exit for now\n")
print("You can seek help or report issues at: https://github.com/atulkakrana/sPARTA/issues")
sys.exit()
# isTally = shutil.which("tally")
# if isTally:
# print("Found:Tally")
# pass
# else:
# print("Please install 'Tally' before using the tool")
# print("See README for how to INSTALL")
# sys.exit()
return None
def checkfiles():
'''
pre-analysis check for files in working directory
'''
print("\n#### Fn: Check input files #############################")
goSignal = 1
##
if args.featureFile:
## Check for feature file
if not os.path.isfile(args.featureFile):
# print("'%s' genome file not found at:%s" % (genomeFile,os.getcwd()))
print("--featureFile : missing")
goSignal = 0
else:
print("--featureFile : found")
pass
if args.miRNAFile:
## check for miRNA file
if not os.path.isfile(args.miRNAFile):
# print("'%s' file not found at:%s" % (args.miRNAFile,os.getcwd()))
# print("Please check if miRNA fasta file exists in your directory\n")
print("--miRNAFile : missing")
goSignal = 0
else:
print("--miRNAFile : found")
pass
if args.annoFile:
## Check for annotation file
if not os.path.isfile(args.annoFile):
# print("'%s' file not found at:%s" % (args.annoFile,os.getcwd()))
# print("Please check if GTF/GFF3 file exists in your directory\n")
print("--annoFile : missing")
goSignal = 0
else:
print("--annoFile : found")
pass
if args.genomeFile:
## Check for genome file
if not os.path.isfile(args.genomeFile):
# print("'%s' genome file not found at:%s" % (genomeFile,os.getcwd()))
print("--genomeFile : missing")
goSignal = -1
else:
print("--genomeFile : found")
pass
if args.libs:
## Check for PARE libs
for alib in args.libs:
if not os.path.isfile(alib):
# print("'%s' file not found at:%s" % (alib,os.getcwd()))
# print("Please check if GTF/GFF3 file exists in your directory\n")
print("--PARE lib(s) : missing")
goSignal = -2
else:
print("--PARE lib(s) : found")
pass
if goSignal == 0:
print("\n** Some input file(s) couldn't be located !!")
print("** sPARTA expects all input file to be in same directory with sPARTA program")
print("** Please copy the missing input files in '%s' directory and rerun sPARTA" % (os.getcwd()))
print("** sPARTA will exit for now \n")
print("You can seek help or report issues at: https://github.com/atulkakrana/sPARTA/issues")
sys.exit()
elif goSignal == -1:
print("\n** Genome FASTA file couldn't be located !!")
print("** sPARTA expects all input file to be in same directory with sPARTA program")
print("** Please copy the genome FASTA file in '%s' directory and rerun sPARTA" % (os.getcwd()))
print("** Make sure that your genome FASTA has no digits/numbers in filename, otherwise it will be deleted in cleanup operation")
print("** sPARTA will exit for now \n")
print("You can seek help or report issues at: https://github.com/atulkakrana/sPARTA/issues")
sys.exit()
elif goSignal == -2:
print("\n** All PARE/Degradome libraries couldn't be located !!")
print("** sPARTA expects all libraries in the same directory with sPARTA program")
print("** At least one of the input library is missing from '%s' directory" % (os.getcwd()))
print("** Please check and and rerun sPARTA")
print("** sPARTA will exit for now \n")
print("You can seek help or report issues at: https://github.com/atulkakrana/sPARTA/issues")
sys.exit()
else:
## Good to go !!
pass
return None
def genomeReader(genomeFile):
'''
Reads Genome FASTA file
'''
print("\n#### Fn: genomeReader ##################################")
#### Check for genome file
if not os.path.isfile(genomeFile):
print("'%s' genome file not found at:%s" % (genomeFile,os.getcwd()))
print("\nPlease check the genomeFile - Is it in sPARTA directory? Did you input wrong name?")
print("If it exists and you input correct name then it could have been deleted while last cleanup operation - Sorry!!")
print("If deleted then please rename genomeFile (without any integers in name) and re-copy in sPARTA folder\n")
print("Script will exit for now\n")
sys.exit()
else:
fh_in = open(genomeFile, 'r')
pass
print("Caching genome fasta")
genomeFile = fh_in.read()
genomeList = genomeFile.split('>')[1:]
chromoDict = {}
chromoLenDict = {}
for i in genomeList:
chromoInfo = i.partition('\n')
chrid = chromoInfo[0].split()[0]
chrSeq = chromoInfo[2].replace("\n", "").strip()
# print ("head",chrSeq[:20].strip())
# print ("excise",chrSeq[9:50].strip())
# print ("tail",chrSeq[-20:].strip())
# sys.exit()
chromoDict[chrid] = chrSeq ## Bug 05-12-17a - The value for chromosome dictionary was being stored as list here, and list convetred to string in EXTRACTFASTA1, therefore adding [' to 5'-end and '] to 3'-end - Fixed
chromoLenDict[chrid] = int(len(chrSeq)) ## To convert coords to python format subtract 1nt - reverted in v1.25 as no such offset required for length
print("Genome dict prepared for %s chromosome/scaffolds" % (len(chromoDict)))
return chromoDict,chromoLenDict
def gtfParser(gtfFile):
'''This function parses Trinity and Rocket GTF file
to give a trascript entry and entry for all exons - Basically this is the parser for gffread geenrated files'''
print("\n#### Fn: gtfParser #####################################")
#### Check if file exists
if not os.path.isfile(gtfFile):
print("'%s' file not found at:%s" % (gtfFile,os.getcwd()))
print("Please check if GTF file exists in your directory\n")
sys.exit()
else:
pass
### Read GTF ####
mode = 3 ## This is the mode from real module that corresponts to TopHat/Cufflink GTF files
with open(gtfFile) as fh_in:
lines = (line.rstrip() for line in fh_in)
gtfRead = list(line for line in lines if line) # Non-blank lines in a list
fh_in.close()
### PARSE GTF file #####
gtfList = [] ## List to hold parsed GTF entries
tempName = [] ## Stores current trans name
tempCoords = [] ## Temp coords
for i in gtfRead:
# print(i)
ent = i.split("\t")
# print("\nEnt",ent)
gScore = ent[5] ## Score provided for mapping accuracy in Trinity GTF from GMAP 0 to 100
gtype = ent[2]
if gtype == "exon": #### <exon/CDS> Use CDS to get similar number of transcripts as in GFF3 file - but CDS doesn't include UTRs so exon is choosen to represent protein-coding genes
# print("\nExon:",ent)
gchr = ent[0]
gstart = int(ent[3])
gend = int(ent[4])
gstrand = ent[6].translate(str.maketrans("+-","wc"))
info = ent[8].strip("\n").split(";")
# print(info)
gid = info[1].split()[1].replace('"','') ## Gene ID
# print('-',gid,gchr,gstart,gend,gstrand)
## Parse the info and add the exon entries #################
#############################################################
# if mode == 2 or (mode == 4 and len(info)==4): ## This is trinity GTF info
# ## Protein coding gene with a version number
# tid = info[0].split()[1].replace('"','').split(".")[0] ## Transcript ID
# gid = info[2].split()[1].replace('"','').rpartition("_")[0] ## Gene ID
# # aflag = 'T' ## Trinity
# print('-',gid,gchr,gstart,gend,gstrand,geneType)
# gtfList.append((gid,tid,gchr,gstart,gend,gstrand,geneType))
# elif mode == 3 or (mode ==4 and len(info) >= 7): ## This is rocket GTF info, with and w/o p_id "P1" in the end
# tid = info[0].split()[1].replace('"','') ## Transcript ID
# gid = info[1].split()[1].replace('"','') ## Gene ID
# # aflag = 'R' ## Rocket
# geneType = 'gene'
# # print('-',gid,gchr,gstart,gend,gstrand,geneType)
# gtfList.append((gchr,gstrand,gid,gstart,gend,geneType)) ###@@@@@@@@@@@@@@@@@@@ CHECK @@@@@@@@@@@@@@@@@@
# else:
# print("-This entry has more than expected info")
# print("-Info block",info)
# print("-Info block len:%s" % len(info))
# print("-Debug!!")
# sys.exit()
## Check trascript change #########
###################################
## Check if transcript entry needs to be added i.e. all the exons from previous one has been captured
if not tempName or tempName[0] == gid:
tempName.append(gid)
tempCoords.append(gstart)
tempCoords.append(gend)
tempStrand = gstrand ## Strand of last transcript
tempgid = gid ## Gene name of last transcript
tempChr = gchr ## Chr of last transcript
elif tempName[0] != gid: ## Check if the new transcript is being read. if yes then add transcript entry for old exons using tempName and tempCoords
# print("-New transcript read - summarizing transcript entry for earlier one")
gstartProcess = min(tempCoords)
gendProcess = max(tempCoords)
gidProcess = tempName[0]
geneType = 'gene'
# print('-',tempChr,tempStrand,gidProcess,gstartProcess,gendProcess,geneType)
gtfList.append((tempChr,tempStrand,gidProcess,int(gstartProcess),int(gendProcess),geneType)) ### @@@@@@@@@@@@@@@@@ CHECK @@@@@@@@@@@@@@@@@@@@@@ gene start and gene end
## Empty lists and fill with current exon from new transcript
tempName = [] ## Empty gene names
tempCoords = [] ## Empty exon coords from earlier gene
tempName.append(gid) ## Add new gene name
tempCoords.append(gstart) ## Add exons coords for new gene
tempCoords.append(gend) ## Add exons coords for new gene
tempStrand = gstrand ## Strand of last transcript
tempgid = gid ## Gene name of last transcript
tempChr = gchr ## Chr of last transcript
# sys.exit()
else:
print("-Unforseen scenario encountered")
print("There seems to be a problem with GTF file - Please check integrity of your GTF file")
sys.exit()
else:
# print("We don't need this info for current script") ## CDS or GENE
pass
if len(gtfList) == 0:
print("Check if the feature used for extraction i.e. gene, transcript, exon is correct")
print("Debug!!")
sys.exit()
else:
# print("Total entries fetched from GTF file:%s" % (str(len(gtfList))))
# print("First 10 entries of gtfList list: %s" % (gtfList[:10]))
pass
### Sort entries to prepare for extraction
genome_info = list(gtfList)
# genome_info_sorted = sorted(gtfList, key=operator.itemgetter(0,1,3)) ## CHR_ID, strand and start
genome_info.sort(key=lambda k:(k[0],k[1],k[3])) #
genome_info_inter = genome_info
print("Entries in genome_info:%s" % (len(genome_info)))
print("First 10 entries of genome_info list: %s" % (genome_info[:10]))
# sys.exit()
return genome_info,genome_info_inter
def gffParser(gffFile):
'''
Extract coordinates from GFF file. tested with GFF3 file from phytozome
'''
print("\n####Fn: gffParser ######################################")
#### Check for GFF file
if not os.path.isfile(gffFile):
print("'%s' file not found at:%s" % (gffFile,os.getcwd()))
print("Please check if GFF file exists in your directory\n")
sys.exit()
else:
pass
#### Parse
fh_in = open(gffFile,'r')
# fh_in.readline() ## GFF version - Fixes bug 05-12-17b - First entry was being missed, any line with # is skipped below
gffRead = fh_in.readlines()
genome_info = []
chromo_dict = {}
for i in gffRead:
if i.rstrip(): ## If line not empty
ent = i.strip('\n').split('\t')
# print (ent)
if i.startswith("#"): ## Fixes Bug: 010115
pass
else:
if ent[2] == 'gene':
chrID = ent[0]
strand = ent[6].translate(str.maketrans("+-","wc"))
geneName = ent[8].split(';')[0].split('=')[1]
geneType = 'gene'
# print(chrID,strand,geneName,ent[3],ent[4],geneType)
genome_info.append((chrID,strand,geneName,int(ent[3]),int(ent[4]),geneType))
#genome_info_inter = genome_info ##
# genome_info_sorted = sorted(genome_info, key=operator.itemgetter(0,1,3)) ## CHR_ID, strand and start
genome_info.sort(key=lambda k:(k[0],k[1],k[3])) #
genome_info_inter = genome_info
print("Entries in genome_info:%s" % (len(genome_info)))
# print("First 10 entries of genome_info list: %s" % (genome_info[:10]))
# sys.exit()
fh_in.close()
return genome_info,genome_info_inter
def extractFeatures(genomeFile,chromoDict,chromoLenDict,genome_info,genome_info_inter):
'''
extract coordinates of genes and intergenic regions
'''
print("\n#### Fn: extractFeatures ###############################")
alist = []##
for i in range(0, int(len(genome_info))+1): #
# print (i)
gene1 = (genome_info[i])
gene2 = (genome_info[i+1])
gene_type = 'inter' #
achr = gene1[0]
bchr = gene2[0]
# print(gene1,gene2)
# sys.exit()
if gene1[3] == gene2[3] and gene1[4] == gene2[4]:
## Gene is same/overlapping consider next gene
pass
else:
if tuple(gene1[0:2]) not in alist:#
print ('--Caching gene coords for chromosome: %s and strand: %s' % (gene1[0], gene1[1]))
# print(gene1,gene2)
alist.append((gene1[0:2]))
inter_start1 = 1
inter_end1 = int(gene1[3])-1#
## If both the genes belong to same chr and strand i.e. chromosome has atleast two genes
if gene1[0] == gene2[0] and gene1[1] == gene2[1]:
inter_start2 = int(gene1[4])+1 ## From end of first gene of chromosome
inter_end2 = int(gene2[3])-1 ## Till start of second gene
if gene1[1] == 'w': ## The gene is on positive strand so upstream
inter_name1 = ('%s_up' % (gene1[2]))
inter_name2 = ('%s_up' % (gene2[2]))
else: ## Gene is on negative strand
inter_name1 = ('%s_down' % (gene1[2]))
inter_name2 = ('%s_up' % (gene1[2]))
# print(gene1[0],gene1[1],inter_name1,inter_start1,inter_end1,gene_type)
# print(gene2[0],gene2[1],inter_name2,inter_start2,inter_end2,gene_type)
genome_info_inter.append((gene1[0],gene1[1],inter_name1,inter_start1,inter_end1,gene_type))
genome_info_inter.append((gene2[0],gene2[1],inter_name2,inter_start2,inter_end2,gene_type))
## If the "first two" genes are not from same strand i.e. there is only one gene on this chromosome and strand
else: ## Intergenic from end of chromosome/scaffold
inter_start2 = int(gene1[4])+1 ## From end of first gene of chromosome
# inter_end2 = '-' ### Till end of chromosome
try:
inter_end2 = int(chromoLenDict[achr]) ## Till end of chromosome
except KeyError:
print("** '%s' is either missing from FASTA file or" % (achr))
print("** the name does not matches with the GFF/GTF file")
print("**Check if the '%s' exists in your genome FASTA file" % (achr))
print("**If '%s' is present in your genome FASTA file then make sure the names matches\n" % (achr))
sys.exit()
if gene1[1] == 'w': ##The gene is on positive strand so upstream
inter_name1 = ('%s_up' % (gene1[2]))
inter_name2 = ('%s_down' % (gene1[2]))
else: ##Its on negative strand
inter_name1 = ('%s_down' % (gene1[2]))
inter_name2 = ('%s_up' % (gene1[2]))
# print(gene1[0],gene1[1],inter_name1,inter_start1,inter_end1,gene_type)
# print(gene1[0],gene1[1],inter_name2,inter_start2,inter_end2,gene_type)
genome_info_inter.append((gene1[0],gene1[1],inter_name1,inter_start1,inter_end1,gene_type))
genome_info_inter.append((gene1[0],gene1[1],inter_name2,inter_start2,inter_end2,gene_type))
else:
if gene1[0] == gene2[0] and gene1[1] == gene2[1]: ### If chr_id and strands are same than find intergenic.
inter_start = int(gene1[4])+1
inter_end = int(gene2[3])-1
if gene2[1] == 'w': #
inter_name = ('%s_up' % (gene2[2]))
else:
## reverse strand
inter_name = ('%s_up' % (gene1[2]))
# print(gene2[0],gene2[1],inter_name,inter_start,inter_end,gene_type)
genome_info_inter.append((gene2[0],gene2[1],inter_name,inter_start,inter_end,gene_type))
else:
inter_start = int(gene1[4])+1#
# inter_end = '-' ## Different from MPPP as no table ro query for end of chromosome in public version
try:
inter_end2 = int(chromoLenDict[achr]) ## Till end of chromosome
except KeyError:
print("** '%s' is either missing from FASTA file or" % (achr))
print("** the name does not matches with the GFF/GTF file")
print("**Check if the '%s' exists in your genome FASTA file" % (achr))
print("**If '%s' is present in your genome FASTA file then make sure the names matches\n" % (achr))
sys.exit()
if gene1[1] == 'w':
inter_name = ('%s_down' % (gene1[2]))
else:
inter_name = ('%s_up' % (gene1[2]))
# print(gene1[0],gene1[1],inter_name,inter_start,inter_end2,gene_type)
genome_info_inter.append((gene1[0],gene1[1],inter_name,inter_start,inter_end2,gene_type)) ##Chr_id, strand
## Additional check for scaffolded genomes, if there are no genes in a scaffold it's whole seqeunce will be fetched as intergenic
if args.genomeFeature == 1:
for i in chromoDict.keys():
alen = len(chromoDict[i])
# print("Chr:%s | Length:%s" % (i,alen))
if tuple((i,'c')) in alist:
# print("Found")
# sys.exit()
pass
else:
# print("Get the tuple")
inter_name = ('chr%s_c_all' % (i))
genome_info_inter.append((i,'c',inter_name,1,alen,'inter')) ## Chr_id, strand, name, start, stop, length
if tuple((i,'w')) in alist:
# print("Found")
# sys.exit()
pass
else:
# print("Get the tuple")
inter_name = ('chr%s_w_all' % (i))
genome_info_inter.append((i,'w',inter_name,1,alen,'inter')) ## Chr_id, strand, name, start, stop, length
## Sort the list after adding intergenic regions on on basis of chr_id and strand that is essential while caching chromosme during slicing sequences
genome_info_inter_sort = sorted(genome_info_inter, key=operator.itemgetter(0,1))
gene_coords_file = './coords'
coords_out = open(gene_coords_file, 'w')
coords = [] ## List to hold extracted coords
if args.genomeFeature == 2: ## Both gene and inter
for ent in genome_info_inter_sort:
print(ent)
if ent[4] == '-': ## End of chromosome
coords.append(ent[0:])
coords_out.write('%s,%s,%s,%s,%s,%s\n' % (ent[0:]))
elif int(ent[4])-int(ent[3]) > 25:
coords.append(ent[0:])
coords_out.write('%s,%s,%s,%s,%s,%s\n' % (ent[0:]))
else:
if args.genomeFeature == 0:
genomeFilter = 'gene'
elif args.genomeFeature == 1:
genomeFilter = 'inter'
else:
pass
for ent in genome_info_inter_sort:
if (ent[5] == genomeFilter):
#print(ent)
if ent[4] == '-': ## End of chromosome
coords.append(ent[0:])
coords_out.write('%s,%s,%s,%s,%s,%s\n' % (ent[0:]))
elif int(ent[4])-int(ent[3]) > 25:
coords.append(ent[0:])
coords_out.write('%s,%s,%s,%s,%s,%s\n' % (ent[0:]))
else:
## Too short to use for analysis
pass
print ("Number of coords in 'coords' list: %s" % (len(coords)))
coords_out.close()
return coords
def getFASTA1(genomeFile,coords,chromoDict):
'''
Extracts Genes or intergenic regions based on coords list from genome seqeunce - New proposed name get features
'''
print("\n#### Fn: getFASTA ######################################")
fastaOut = './genomic_seq.fa'
fh_out = open(fastaOut, 'w')
print("Fetching genes or intergenic regions")
fastaList = [] ## Stores name and seq for fastFile
chromo_mem = []
for i in coords: ## Coords is list from annotation parser
#print (i)
chr_id = i[0]
strand = i[1]
gene = i[2]
start = i[3] - 1
end = i[4]
# print('\nGene:%s | Start:%s End:%s Chr:%s Strand:%s' % (gene,start,end,chr_id,strand))
if tuple(i[0:2]) not in chromo_mem:
chromo_mem.append(tuple(i[0:2])) ## First entry of chromosome
print("--Reading chromosome: %s and strand: %s" % (i[0],i[1]) )
# print('--Gene:%s | Start:%s End:%s Chr:%s Strand:%s' % (gene,start,end,chr_id,strand))
# print("-- Fetching gene:%s" % (gene))
chrKey = i[0]
try:
chromo = str(chromoDict[chrKey]) ## Bug 05-12-17a - The value for chromosome dictionary was a list, being convetred to string, therfore adding [' to 5'-end and '] to 3'-end - Fixed
# print ("head",chromo[:20].strip())
# print ("excise",chromo[9:50].strip())
# print ("tail",chromo[-20:].strip())
except KeyError:
print("** '%s' is either missing from FASTA file or" % (achr))
print("** the name does not matches with the GFF/GTF file")
print("**Check if the '%s' exists in your genome FASTA file" % (achr))
print("**If '%s' is present in your genome FASTA file then make sure the names matches\n" % (achr))
sys.exit()
if end == '-':
gene_seq = chromo[start:].translate(str.maketrans("autgcn","AUTGCN")) ## Usually first entry (recorded in this loop) will be from chr-start to gene-start, but if this coord/region is few nts (<20nt)
## it is skipped, and you will encouter directly the gene-end to chr-end region, and you need this loop - Very rare case
else:
gene_seq = chromo[start:end].translate(str.maketrans("autgcn","AUTGCN"))
ncount = gene_seq.count('N') ### Check of 'Ns' that cause bowtie to hang for a very long time. This is observed in chickpea genome.
if ncount < len(gene_seq):
if strand == 'c':
gene_seq_rev = gene_seq[::-1].translate(str.maketrans("TAGC","ATCG"))
fh_out.write('>%s\n%s\n' % (gene,gene_seq_rev))
fastaList.append((gene,gene_seq_rev))
else:
fh_out.write('>%s\n%s\n' % (gene,gene_seq))
fastaList.append((gene,gene_seq))
elif end == '-':
# print('--Gene:%s | Start:%s End:%s Chr:%s Strand:%s' % (gene,start,end,chr_id,strand))
# print("--Fetching gene:%s" % (gene))
gene_seq = chromo[start:].translate(str.maketrans("autgcn","AUTGCN")) ##
ncount = gene_seq.count('N') ### Check of 'Ns' that cause bowtie to hang for a very long time. This is observed in chickpea genome.
if ncount < len(gene_seq):
if strand == 'c':
gene_seq_rev = gene_seq[::-1].translate(str.maketrans("TAGC","ATCG"))
fh_out.write('>%s\n%s\n' % (gene,gene_seq_rev))
fastaList.append((gene,gene_seq_rev))
else:
fh_out.write('>%s\n%s\n' % (gene,gene_seq))
fastaList.append((gene,gene_seq))
else:
# print('--Gene:%s | Start:%s End:%s Chr:%s Strand:%s' % (gene,start,end,chr_id,strand))
# print("--Fetching gene:%s" % (gene))
gene_seq = chromo[start:end].translate(str.maketrans("autgcn","AUTGCN")) ##
ncount = gene_seq.count('N') ### Check of 'Ns' that cause bowtie to hang for a very long time. This is observed in chickpea genome.
if ncount < len(gene_seq):
if strand == 'c':
gene_seq_rev = gene_seq[::-1].translate(str.maketrans("TAGC","ATCG"))
fh_out.write('>%s\n%s\n' % (gene,gene_seq_rev))
fastaList.append((gene,gene_seq_rev))
else:
fh_out.write('>%s\n%s\n' % (gene,gene_seq))
fastaList.append((gene,gene_seq))
# ## Test extracted coords - IDs correspond to ChickPea genome
# if gene == "Ca_99998":
# print('\nGene:%s | Start:%s End:%s Chr:%s Strand:%s' % (gene,start,end,chr_id,strand))
# print(gene_seq_rev)
# elif gene == "Ca_99999":
# print('\nGene:%s | Start:%s End:%s Chr:%s Strand:%s' % (gene,start,end,chr_id,strand))
# print(gene,gene_seq)
# sys.exit()
# else:
# pass
time.sleep(3)
fh_out.close()
# sys.exit()
return fastaOut,fastaList
def fastaReader(fastaFile):
'''Cleans FASTA file - multi-line fasta to single line, header clean, empty lines removal'''
print("\nFn - fastaReader")
## Read seqeunce file
print ('+Reading "%s" FASTA file' % (fastaFile))
fh_in = open(fastaFile, 'r')
fasta = fh_in.read()
fasta_splt = fasta.split('>')
acount = 0 ## count the number of entries
empty_count = 0
fastaList = [] ## Stores name and seq for fastFile
# fastaDict = {}
acount +=1
for i in fasta_splt[1:]:
acount +=1
ent = i.split('\n')
name = ent[0].split()[0].strip()
seq = ''.join(x.strip() for x in ent[1:]) ## Sequence in multiple lines
# alen = len(seq)
if seq:
fastaList.append((name,seq))
# fastaDict[name] = seq
print("--Total entries in phased fastaFile:%s" % (str(acount)))
print("--fastaList generated with %s entries\n" % (str(len(fastaList)))) ## Does not counts an empty enry from split
return fastaList
def fragmentor(FASTA,fastaList,nseq,nfrags):
'''
fragments and writes frags
'''
aslice = int(nseq/nfrags) ## not round to float, float added to remainder
remainder = nseq%nfrags
print("--Number of seqeunces:%s | required fragments:%s | computed splice:%s | remainder seqeunces:%s" % (nseq,nfrags,aslice,remainder))
sliceStart = 0 ## Keep track of slice end
sliceEnd = aslice
acount = 0
for x in range (int(nfrags+1)): ## Extra one for remainder
if x < 10:
afile = "%s.frag.0%s.fa" % (FASTA.rpartition('.')[0],x)
fh_out = open(afile,'w')
else:
afile = "%s.frag.%s.fa" % (FASTA.rpartition('.')[0],x)
fh_out = open(afile,'w')
if x < nfrags: ## '<' because the 'x' starts from zero and goes till nfrags-1. if nfrags = 30, x will go from 0 to 29. The last bin (nfrag+1) will have value 30 and is captured below
## Slice list
print("--Fragment:%s | sliceStart:%s | sliceEnd:%s" % (acount,sliceStart,sliceEnd))
alist = fastaList[sliceStart:sliceEnd]
#write fasta
for ent in alist:
name = ent[0].split()[0].strip()
seq = ''.join(x.strip() for x in ent[1:]) ## Sequence in multiple lines
fh_out.write('>%s\n%s\n' % (name,seq))
## Slice update
acount += 1
sliceStart += aslice
sliceEnd += aslice
else: ## Write remaineder seqeunces in last file
print("--Fragment:%s | sliceStart:%s | sliceEnd: Till end" % (acount,sliceStart))
alist = fastaList[sliceStart:]
#write fasta
for ent in alist:
name = ent[0].split()[0].strip()
seq = ''.join(x.strip() for x in ent[1:]) ## Sequence in multiple lines
fh_out.write('>%s\n%s\n' % (name,seq))
acount += 1
fh_out.close()
return None
def fragFASTA(FASTA,fastaList):
print('Fn - fragFASTA')
## Purge fragmented files from earlier run
shutil.rmtree('./genome', ignore_errors=True)
os.mkdir('./genome')
pattern = ".*\.frag\.[0-9].*\.fa" #
print ("+Purging older files")
for file in os.listdir():
if re.search(pattern,file):
print ('--%s is being deleted' % (file))
os.remove(file)
## Compute nfrags and seq. per frag user FASTA file ######
statInfo = os.stat(FASTA)
filesize = round(statInfo.st_size/1048576,2)
print('\n+Input FASTA size: %sMB**' % (filesize))##
if filesize <= args.splitCutoff: ##
fls = []
fls.append(FASTA)
print ('--No fragmentation performed for file %s' % (fls))
else:
# fh_in = open(FASTA, 'r')
# seq_count = fh_in.read().count('>')
nseq = len(fastaList)
print('--Number of headers in file: %s'% (nseq))
#if genome == 'N':
if nseq >= 30: #
if filesize <= 3072:
nfrags = int(args.maxHits)
elif filesize > 3072 and filesize <= 5120:
nfrags = int(round(args.maxHits*1.25))
else:
nfrags = int(round(args.maxHits*1.5))
print ("--Size based fragmentation in process for '%s' file" % (FASTA))
# retcode = subprocess.call(["pyfasta","split", "-n", nfrags, FASTA]) - Deprecated
fragmentor(FASTA,fastaList,nseq,nfrags)
fls = [file for file in os.listdir() if re.search(r'.*\.frag\.[0-9].*\.fa', file)] ## file list using regex
#fls = glob.glob(r'%s.[0-9]{1-3}.fa' % (FASTA.split('.')[0])) ## fragmented file list ##
#print ('The fragments: %s' % (fls))
else:
nfrags = str(nseq) ##
if fragFasta == 'Y':
print ("--Header based fragmentation in process for '%s' file" % (FASTA))
# retcode = subprocess.call(["pyfasta","split", "-n", nfrags, FASTA]) - Deprecated
fragmentor(FASTA,fastaList,nseq,frags)
fls = [file for file in os.listdir() if re.search(r'.*\.frag\.[0-9].*\.fa', file)]
#print ('The fragments: %s' % (fls))
memFile = open('frag.mem','w')
memFile.write("fasta=%s\n" % (FASTA))
memFile.write("genomeFeature=%s\n" % (str(args.genomeFeature)))
memFile.write("size=%sMB\n" % (filesize))
memFile.write("frags=%s" % (','.join(fls)))
memFile.close()
return fls
def miRinput():
'''
Cleans miRNA fasta file and processes for target prediction
'''
print("\nFn: miRNAProcessor#########################################")
#### Check for file
if not os.path.isfile(args.miRNAFile):
print("'%s' file not found at:%s" % (args.miRNAFile,os.getcwd()))
print("Please check if miRNA fasta file exists in your directory\n")
sys.exit()