diff --git a/README.md b/README.md index d743109..cb6ad66 100644 --- a/README.md +++ b/README.md @@ -64,7 +64,7 @@ usage: StORF_Reporter.py [-h] [-olap_filt [{none,single-strand,both-strand}]] [-start_filt {True,False}] [-so [{start_pos,strand}]] [-f_type [{StORF,CDS,ORF}]] [-olap OVERLAP_NT] [-ao ALLOWED_OVERLAP] [-overwrite {True,False}] [-verbose {True,False}] [-v] -StORF-Reporter v1.3.4: StORF-Reporter Run Parameters. +StORF-Reporter v1.4.0: StORF-Reporter Run Parameters. Required Options: -anno [{Prokka,Bakta,Out_Dir,Multiple_Out_Dirs,Single_GFF,Multiple_GFFs,Ensembl,Feature_Types,Single_Genome,Multiple_Genomes,Single_Combined_GFF,Multiple_Combined_GFFs,Pyrodigal,Single_FASTA,Multiple_FASTA} ...] @@ -148,6 +148,10 @@ StORF-Finder Options: -f_type [{StORF,CDS,ORF}] Default - "CDS": Which GFF feature type for StORFs to be reported as in GFF - "CDS" is probably needed for use in tools such as Roary and Panaroo -olap OVERLAP_NT Default - 50: Maximum number of nt of a StORF which can overlap another StORF. + -non_standard NON_STANDARD + Default - 0.20: Reject StORFs with >=20% non-standard nucleotides (A,T,G,C) - Provide % as + decimal + -ao ALLOWED_OVERLAP Default - 50 nt: Maximum overlap between a StORF and an original gene. Misc: @@ -174,7 +178,7 @@ usage: StORF_Extractor.py [-h] [-storf_input {Combined,Separate}] [-p PATH] [-gf [-lw {True,False}] [-stop_ident {True,False}] [-oname O_NAME] [-odir O_DIR] [-gz {True,False}] [-verbose {True,False}] [-v] -Single_Genome v1.3.4: StORF-Extractor Run Parameters. +Single_Genome v1.4.0: StORF-Extractor Run Parameters. Required Arguments: -storf_input {Combined,Separate} @@ -210,7 +214,7 @@ usage: StORF_Finder.py [-h] [-f FASTA] [-ua {True,False}] [-wc {True,False}] [-p [-stop_ident {True,False}] [-f_type [{StORF,CDS,ORF}]] [-minorf MIN_ORF] [-maxorf MAX_ORF] [-codons STOP_CODONS] [-olap OVERLAP_NT] [-s SUFFIX] [-so [{start_pos,strand}]] [-spos {True,False}] [-oname O_NAME] [-odir O_DIR] [-gff {True,False}] [-aa {True,False}] [-aa_only {True,False}] [-lw {True,False}] [-gff_fasta {True,False}] [-gz {True,False}] [-verbose {True,False}] [-v] -StORF-Reporter v1.3.4: StORF-Finder Run Parameters. +StORF-Reporter v1.4.0: StORF-Finder Run Parameters. Required Arguments: -f FASTA Input FASTA File - (UR_Extractor output) @@ -240,6 +244,9 @@ Optional Arguments: -maxorf MAX_ORF Default - 60kb: Maximum StORF size in nt -codons STOP_CODONS Default - ('TAG,TGA,TAA'): List Stop Codons to use -olap OVERLAP_NT Default - 50: Maximum number of nt of a StORF which can overlap another StORF. + -non_standard NON_STANDARD + Default - 0.20: Reject StORFs with >=20% non-standard nucleotides (A,T,G,C) - Provide % as + decimal -s SUFFIX Default - Do not append suffix to genome ID -so [{start_pos,strand}] Default - Start Position: How should StORFs be ordered when >1 reported in a single UR. @@ -274,7 +281,7 @@ StORF-Extractor -storf_input Combined -p .../Test_Datasets/Combined_GFFs/E-coli_ ```python usage: StORF_Extractor.py [-h] [-storf_input {Combined,Separate}] [-p PATH] [-gff_out {True,False}] [-oname O_NAME] [-odir O_DIR] [-gz {True,False}] [-verbose {True,False}] [-v] -StORF-Reporter v1.3.4: StORF-Extractor Run Parameters. +StORF-Reporter v1.4.0: StORF-Extractor Run Parameters. Required Arguments: -storf_input {Combined,Separate} @@ -307,7 +314,7 @@ StORF-Remover -gff .../Test_Datasets/StORF_Extractor_And_Remover/Myco_UR_StORF-R usage: StORF_Remover.py [-h] [-gff GFF] [-blast BLAST] [-min_score MINSCORE] [-oname O_NAME] [-odir O_DIR] [-gz {True,False}] [-verbose {True,False}] [-v] -StORF-Reporter v1.3.4: UR-Remover Run Parameters. +StORF-Reporter v1.4.0: UR-Remover Run Parameters. Required Arguments: -gff GFF GFF annotation file for the FASTA diff --git a/setup.cfg b/setup.cfg index 440d007..09b00b5 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = StORF-Reporter -version = v1.3.4 +version = v1.4.0 author = Nicholas Dimonaco author_email = nicholas@dimonaco.co.uk description = StORF-Reporter - A a tool that takes an annotated genome and returns missing CDS genes (Stop-to-Stop) from unannotated regions. diff --git a/src/StORF_Reporter/Constants.py b/src/StORF_Reporter/Constants.py index 6a70d68..d1ac824 100644 --- a/src/StORF_Reporter/Constants.py +++ b/src/StORF_Reporter/Constants.py @@ -1 +1 @@ -StORF_Reporter_Version = 'v1.3.4' +StORF_Reporter_Version = 'v1.4.0' diff --git a/src/StORF_Reporter/StORF_Finder.py b/src/StORF_Reporter/StORF_Finder.py index eb0ae70..ed56658 100755 --- a/src/StORF_Reporter/StORF_Finder.py +++ b/src/StORF_Reporter/StORF_Finder.py @@ -7,6 +7,8 @@ import gzip import os import sys +from line_profiler_pycharm import profile + try: from ORForise.utils import sortORFs # Calling from ORForise via pip @@ -42,18 +44,31 @@ def translate_frame(sequence): translate = ''.join([gencode.get(sequence[3 * i:3 * i + 3], 'X') for i in range(len(sequence) // 3)]) return translate + + +############################ +def check_non_standard_chars(options,storf_seq): + non_standard_set = set(['A', 'T', 'G', 'C']) + total_chars = len(storf_seq) + non_standard_count = sum(1 for char in storf_seq if char not in non_standard_set) + non_standard_percentage = non_standard_count / total_chars + if non_standard_percentage > float(options.non_standard): + return False + else: + return True + + ############################ -def reverseCorrectLoci(sequence_id,first,second,third): # here for the negative loci correction - ur_length = int(sequence_id.split('_')[-1]) - int(sequence_id.split('_')[-2]) +def reverseCorrectLoci(options,sequence_length,sequence_id,first,second,third): # here for the negative loci correction if second == None: - corrected_start = max(ur_length - int(third-3),1) - corrected_stop = max(ur_length - int(first-3),1) + corrected_start = max(sequence_length - int(third-3),1) + corrected_stop = max(sequence_length - int(first-3),1) return corrected_start, corrected_stop else: - corrected_start = max(ur_length - int(third-3),1) - corrected_mid = max(ur_length - int(second-3),1) - corrected_stop = max(ur_length - int(first-3),1) + corrected_start = max(sequence_length - int(third-3),1) + corrected_mid = max(sequence_length - int(second-3),1) + corrected_stop = max(sequence_length - int(first-3),1) return corrected_start, corrected_mid, corrected_stop @@ -106,8 +121,8 @@ def start_filtering(storfs): break return keep_storfs +@profile def tile_filtering(storfs,options): #both-strand filtering - #storfs = OrderedDict(sorted(storfs.items(), key=lambda e: tuple(map(int, e[0].split(","))))) # Is this needed if I am reordering below? ################ - Order largest first filtering storfs = sorted(storfs.items(), key=lambda storfs:storfs[1][3],reverse=True) ordered_by_length = OrderedDict() @@ -119,30 +134,37 @@ def tile_filtering(storfs,options): #both-strand filtering ordered_by_length = list(ordered_by_length.items()) while i < num_storfs: pos_x, data_x = ordered_by_length[i] - start_x = int(pos_x.split(',')[0]) - stop_x = int(pos_x.split(',')[-1]) - j = i+1 - while j < num_storfs: - pos_y, data_y = ordered_by_length[j] - start_y = int(pos_y.split(',')[0]) - stop_y = int(pos_y.split(',')[-1]) - if start_y >= stop_x or stop_y <= start_x: - j+=1 - continue # Not caught up yet / too far - elif start_y >= start_x and stop_y <= stop_x: - ordered_by_length.pop(j) - num_storfs = len(ordered_by_length) - else: # +1 needed for stop codon - x = set(range(start_x,stop_x+1)) - y = set(range(start_y,stop_y+1)) - overlap = len(x.intersection(y)) - if overlap >= options.overlap_nt: + j = i + 1 + if check_non_standard_chars(options,data_x[0]): + start_x = int(pos_x.split(',')[0]) + stop_x = int(pos_x.split(',')[-1]) + while j < num_storfs: + pos_y, data_y = ordered_by_length[j] + start_y = int(pos_y.split(',')[0]) + stop_y = int(pos_y.split(',')[-1]) + if start_y >= stop_x or stop_y <= start_x: + j+=1 + continue # Not caught up yet / too far + elif start_y >= start_x and stop_y <= stop_x: + # StORF Y is completely contained within StORF X, remove Y ordered_by_length.pop(j) - num_storfs = len(ordered_by_length) - else: - j += 1 - num_storfs = len(ordered_by_length) - i+=1 + num_storfs -= 1 + else: # +1 needed for stop codon + overlap_start = max(start_x, start_y) # The start of the overlapping region + overlap_end = min(stop_x, stop_y) # The end of the overlapping region + # Calculate the length of the overlap + overlap = max(0, overlap_end - overlap_start + 1) # Ensure overlap isn't negative + if overlap >= options.overlap_nt: + ordered_by_length.pop(j) + num_storfs -= 1 + else: + j += 1 + num_storfs -= 1 + i+=1 + else: # Remove StORF if proportion of X's more than allowed + ordered_by_length.pop(i) + num_storfs = len(ordered_by_length) + filtered_storfs = OrderedDict(ordered_by_length) #### Clunky - reordering of StORFs - Through number found (pos then neg strand) or start position? if options.storf_order == 'start_pos': @@ -214,15 +236,19 @@ def prepare_out(options, storfs, seq_id): ';Start_Stop=' + start_stop + ';End_Stop=' + end_stop + ';StORF_Type=' + storf_Type + '\n':data[0]}) ######################################################################### - elif options.intergenic == False: # Not done yet - fa_id = (">" + str(storf_name) + "|" + str(start) + strand + str(stop) + "|Frame:" + str( - frame) + '|Start_Stop=' + start_stop + '|Mid_Stop=' + mid_stop + - '|End_Stop=' + end_stop + '|StORF_Type:' + storf_Type + "\n") - gff_entries.append( native_seq + '\tStORF_Reporter\t' + options.feature_type + '\t' + gff_start + '\t' + gff_stop + '\t.\t' + data[2] + - '\t.\tID=' + storf_name + ';UR=' + ur_name + ';UR_Stop_Locations=' + '-'.join(pos_) + ';Length=' + str( - length) + - ';Frame=' + str(frame) + ';UR_Frame=' + str(ur_frame) + - ';Start_Stop=' + start_stop + ';End_Stop=' + end_stop + ';StORF_Type=' + storf_Type + '\n') + elif options.unannotated == False: # Not done yet + if strand == '+': + frame = (int(stop) % 3) + 4 + elif strand == '-': + frame = (int(stop) % 3) + 1 + storf_name = native_seq + '_' + storf_Type + '_' + str(idx) + ':' + str(start) + '-' + str(stop) + + gff_entries.append(native_seq + '\tStORF_Reporter\t' + options.feature_type + '\t' + str(start) + '\t' + str(stop) + '\t.\t' + data[2] + + '\t.\tID=' + storf_name + ';=' + native_seq + ';UR_Stop_Locations=' + '-'.join(pos_) + ';Length=' + str(length) + + ';Frame=' + str(frame) + ';Start_Stop=' + start_stop + ';End_Stop=' + end_stop + ';StORF_Type=' + storf_Type + '\n') + + fasta_entries.update({'>' + native_seq + ';Length=' + str(length) + ';Strand=' + data[2] + ';Frame=' + str(frame) + + ';Start_Stop=' + start_stop + ';End_Stop=' + end_stop + ';StORF_Type=' + storf_Type + '\n':data[0]}) return gff_entries, fasta_entries @@ -295,7 +321,7 @@ def find_storfs(working_frame,sequence_id,stops,sequence,storfs,short_storfs,con length = next_stop - prev_stop ##### Needed to correct for negative frame loci if working_frame == 'negative': - rev_corrected_start, rev_corrected_mid, rev_corrected_stop = reverseCorrectLoci(sequence_id,prev_stop,stop,next_stop + 3) + rev_corrected_start, rev_corrected_mid, rev_corrected_stop = reverseCorrectLoci(options,len(sequence),len(sequence),sequence_id,prev_stop,stop,next_stop + 3) con_StORF_Pos = ",".join([str(rev_corrected_start), str(rev_corrected_mid), str(rev_corrected_stop)]) else: con_StORF_Pos = ",".join([str(prev_stop), str(stop), str(next_stop + 3)]) @@ -313,7 +339,7 @@ def find_storfs(working_frame,sequence_id,stops,sequence,storfs,short_storfs,con prev_con_StORF = prev_con_StORF_CHECKER(prev_con_StORF,sequence,options) ##### Needed to correct for negative frame loci if working_frame == 'negative': - rev_corrected_start, rev_corrected_mid, rev_corrected_stop = reverseCorrectLoci(sequence_id,prev_stop,stop,next_stop + 3) + rev_corrected_start, rev_corrected_mid, rev_corrected_stop = reverseCorrectLoci(options,len(sequence),sequence_id,prev_stop,stop,next_stop + 3) con_StORF_Pos = ",".join([str(rev_corrected_start), str(rev_corrected_mid), str(rev_corrected_stop)]) else: con_StORF_Pos = ",".join([str(prev_stop), str(stop), str(next_stop + 3)]) @@ -328,7 +354,7 @@ def find_storfs(working_frame,sequence_id,stops,sequence,storfs,short_storfs,con seq = sequence[stop:next_stop + 3] ##### Needed to correct for negative frame loci if working_frame == 'negative': - rev_corrected_start, rev_corrected_stop = reverseCorrectLoci(sequence_id,stop,None,next_stop + 3) + rev_corrected_start, rev_corrected_stop = reverseCorrectLoci(options,len(sequence),sequence_id,stop,None,next_stop + 3) storfs.update({",".join([str(rev_corrected_start), str(rev_corrected_stop)]): [seq, str(frame), strand, length, 'StORF', StORF_idx]}) else: storfs.update({",".join([str(stop), str(next_stop + 3)]): [seq, str(frame), strand, length,'StORF',StORF_idx]}) @@ -344,7 +370,7 @@ def find_storfs(working_frame,sequence_id,stops,sequence,storfs,short_storfs,con seq = sequence[stop:next_stop + 3] ##### Needed to correct for negative frame loci if working_frame == 'negative': - rev_corrected_start, rev_corrected_stop = reverseCorrectLoci(sequence_id,stop,None,next_stop + 3) + rev_corrected_start, rev_corrected_stop = reverseCorrectLoci(options,len(sequence),sequence_id,stop,None,next_stop + 3) storfs.update({",".join([str(rev_corrected_start), str(rev_corrected_stop)]): [ seq, str(frame), strand, length, 'StORF', StORF_idx]}) else: @@ -366,7 +392,7 @@ def find_storfs(working_frame,sequence_id,stops,sequence,storfs,short_storfs,con seq = sequence[stop:next_stop + 3] ##### Needed to correct for negative frame loci if working_frame == 'negative': - rev_corrected_start, rev_corrected_stop = reverseCorrectLoci(sequence_id,stop,None,next_stop + 3) + rev_corrected_start, rev_corrected_stop = reverseCorrectLoci(options,len(sequence),sequence_id,stop,None,next_stop + 3) storfs.update({",".join( [str(rev_corrected_start), str(rev_corrected_stop)]): [seq, str(frame), strand, length, @@ -389,7 +415,7 @@ def find_storfs(working_frame,sequence_id,stops,sequence,storfs,short_storfs,con seq = sequence[stop:next_stop + 3] ##### Needed to correct for negative frame loci if working_frame == 'negative': - rev_corrected_start, rev_corrected_stop = reverseCorrectLoci(sequence_id,stop,None,next_stop + 3) + rev_corrected_start, rev_corrected_stop = reverseCorrectLoci(options,len(sequence),sequence_id,stop,None,next_stop + 3) storfs.update({",".join([str(rev_corrected_start), str(rev_corrected_stop)]): [ seq, str(frame), strand, length, 'StORF', StORF_idx]}) else: @@ -412,7 +438,7 @@ def find_storfs(working_frame,sequence_id,stops,sequence,storfs,short_storfs,con #ps_seq = cut_seq(seq, '-') ##### Needed to correct for negative frame loci if working_frame == 'negative': - rev_corrected_start, rev_corrected_stop = reverseCorrectLoci(sequence_id,stop,None,next_stop + 3) + rev_corrected_start, rev_corrected_stop = reverseCorrectLoci(options,len(sequence),sequence_id,stop,None,next_stop + 3) storfs.update({",".join( [str(rev_corrected_start), str(rev_corrected_stop)]): [seq, str(frame), strand, length, @@ -428,7 +454,7 @@ def find_storfs(working_frame,sequence_id,stops,sequence,storfs,short_storfs,con length = next_stop - stop ##### Needed to correct for negative frame loci if working_frame == 'negative': - rev_corrected_start, rev_corrected_stop = reverseCorrectLoci(sequence_id,stop,None,next_stop + 3) + rev_corrected_start, rev_corrected_stop = reverseCorrectLoci(options,len(sequence),sequence_id,stop,None,next_stop + 3) storfs.update({",".join([str(rev_corrected_start), str(rev_corrected_stop)]): [seq, str(frame), strand, length, 'StORF', StORF_idx]}) else: storfs.update({",".join([str(stop), str(next_stop + 3)]): [seq, str(frame), strand, length,'StORF',StORF_idx]}) @@ -657,7 +683,7 @@ def main(): parser._action_groups.pop() required = parser.add_argument_group('Required Arguments') - required.add_argument('-f', action="store", dest='fasta', required=False, + required.add_argument('-f', action="store", dest='fasta', required=True, help='Input FASTA File - (UR_Extractor output)') optional = parser.add_argument_group('Optional Arguments') @@ -694,6 +720,8 @@ def main(): help='Default - 60kb: Maximum StORF size in nt') optional.add_argument('-codons', action="store", dest='stop_codons', default="TAG,TGA,TAA", help='Default - (\'TAG,TGA,TAA\'): List Stop Codons to use') + optional.add_argument('-non_standard', action="store", dest='non_standard', default="0.20", + help='Default - 0.20: Reject StORFs with >=20%% non-standard nucleotides (A,T,G,C) - Provide %% as decimal') optional.add_argument('-olap', action="store", dest='overlap_nt', default=50, type=int, help='Default - 50: Maximum number of nt of a StORF which can overlap another StORF.') optional.add_argument('-s', action="store", dest='suffix', required=False, diff --git a/src/StORF_Reporter/StORF_Reporter.py b/src/StORF_Reporter/StORF_Reporter.py index 5d82bd9..fcc7e81 100644 --- a/src/StORF_Reporter/StORF_Reporter.py +++ b/src/StORF_Reporter/StORF_Reporter.py @@ -294,7 +294,12 @@ def pyrodigal_predict(fasta,Reporter_options): if Reporter_options.py_train == 'meta': orf_finder = pyrodigal.GeneFinder(meta=True) elif Reporter_options.py_train == 'longest': - orf_finder.train(longest) + try: + orf_finder.train(longest) + except ValueError: + if Reporter_options.verbose == True: + print("Longest sequence is <=20kb. Pyrodigal will be run in meta mode") + orf_finder = pyrodigal.GeneFinder(meta=True) # If NO contig is >= 20kb# for sequence_name, seq in sequences.items(): if Reporter_options.py_train == 'indvidual': try: # Catch if sequence is less than 20kb - will run in meta mode @@ -344,10 +349,12 @@ def run_UR_Extractor_Extended_GFFs(Reporter_options,gff): # When given a directo def run_UR_Extractor_Matched(Reporter_options,gff): # When given a directory with multiple GFFs but with accompianing .fna if '_StORF-Reporter_Extended' not in str(gff): #Might fall over - put a break gff = str(gff) + last_pos = gff.rfind('.') + before_last_pos = gff[:last_pos] Reporter_options.gff = gff - fasta = gff.replace('.gff','.fasta') - fna = gff.replace('.gff', '.fna') - fa = gff.replace('.gff', '.fa') + fasta = before_last_pos + '.fasta' + fna = before_last_pos + '.fna' + fa = before_last_pos + '.fa' if os.path.isfile(fasta): Reporter_options.fasta = fasta elif os.path.isfile(fna): @@ -738,6 +745,8 @@ def main(): choices=['StORF', 'CDS', 'ORF'], help='Default - "CDS": Which GFF feature type for StORFs to be reported as in GFF - ' '"CDS" is probably needed for use in tools such as Roary and Panaroo') + StORF_Finder_args.add_argument('-non_standard', action="store", dest='non_standard', default="0.20", + help='Default - 0.20: Reject StORFs with >=20%% non-standard nucleotides (A,T,G,C) - Provide %% as decimal') StORF_Finder_args.add_argument('-olap', action="store", dest='overlap_nt', default=50, type=int, help='Default - 50: Maximum number of nt of a StORF which can overlap another StORF.') StORF_Finder_args.add_argument('-ao', action="store", dest='allowed_overlap', default=50, type=int, diff --git a/src/StORF_Reporter/UR_Extractor.py b/src/StORF_Reporter/UR_Extractor.py index 2047abe..6839181 100755 --- a/src/StORF_Reporter/UR_Extractor.py +++ b/src/StORF_Reporter/UR_Extractor.py @@ -201,10 +201,10 @@ def main(): parser._action_groups.pop() required = parser.add_argument_group('Required Arguments') - required.add_argument('-f', action='store', dest='fasta', required=False, + required.add_argument('-f', action='store', dest='fasta', required=True, help='FASTA file for Unannotated Region seq extraction') required.add_argument('-gff', action='store', dest='gff', help='GFF annotation file for the FASTA', - required=False) + required=True) optional = parser.add_argument_group('Optional Arguments') optional.add_argument('-ident', action='store', dest='ident', default='_UR',