Skip to content

Commit

Permalink
Merge pull request #47 from NBISweden/feature/1.3
Browse files Browse the repository at this point in the history
Feature/1.3
  • Loading branch information
Juke34 authored Oct 18, 2019
2 parents 789f25f + 323de02 commit a181199
Show file tree
Hide file tree
Showing 4 changed files with 943 additions and 190 deletions.
63 changes: 32 additions & 31 deletions EMBLmyGFF3/EMBLmyGFF3.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ def __init__(self, record = None, verify = False, guess = True):
self.translate = False

#================== def methods =======================
#\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/
#\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/

def _add_mandatory(self):
"""
Expand All @@ -180,7 +180,7 @@ def _add_mandatory(self):
source_feature.qualifiers["isolation_source"] = EMBL.PREVIOUS_VALUES["isolation_source"]
if "isolate" in EMBL.PREVIOUS_VALUES:
source_feature.qualifiers["isolate"] = EMBL.PREVIOUS_VALUES["isolate"]

source_feature.sub_features = [] # important to define that there is no sub_features
self.record.features[0:0] = [source_feature]

# Make sure that there's a gap feature for every span of n's
Expand All @@ -205,13 +205,14 @@ def _add_mandatory(self):
gap_location = FeatureLocation(ExactPosition(start), ExactPosition(end))
gap_location.strand = 1
gap_feature = SeqFeature( gap_location )
gap_feature.sub_features = [] # important to define that there is no sub_features
gap_feature.qualifiers["estimated_length"] = end-start
gap_feature.type = "gap"
self.record.features += [gap_feature]
if EMBL.total_features:
EMBL.total_features += 1
# Move +1 to start back from outside the gap
end +=1
end +=1
start = seq.index('n',end)

except ValueError as e:
Expand Down Expand Up @@ -285,10 +286,10 @@ def _verify_locus_tag(self, locus_tag):
"""
checked_locus_tag=None
while not checked_locus_tag:
if not locus_tag:

if not locus_tag:
sys.stderr.write("No value provided as locus_tag.\nPlease provide a locus_tag (A default XXX locus_tag will be set up if none provided):")
locus_tag = raw_input()
locus_tag = raw_input()
if not locus_tag:
checked_locus_tag="XXX"
break
Expand All @@ -299,9 +300,9 @@ def _verify_locus_tag(self, locus_tag):

# if starts with a letter
if locus_tag[:1].isalpha():
# if contains only alpha-numeric characters
# if contains only alpha-numeric characters
if locus_tag.isalnum():
# if more than 3 characters
# if more than 3 characters
if len(locus_tag) >= 3:
checked_locus_tag = locus_tag # passes all checks, save the value !
else:
Expand Down Expand Up @@ -350,7 +351,7 @@ def get_taxid_from_species(self, species):
search = Entrez.esearch(term=species, db="taxonomy", retmode="xml")
record = Entrez.read(search)
if not record['IdList']: #no taxid found
logging.error("Please verify the species name. '%s' species is unknown into the NCBI taxonomy databse. Impossible to check the taxonomic classification. We will use the default value 'Life' to populate the OC line.",self.species)
logging.warning("Please verify the species name. '%s' species is unknown into the NCBI taxonomy databse. Impossible to check the taxonomic classification. We will use the default value 'Life' to populate the OC line.",self.species)
taxid=None
else:
taxid = record['IdList'][0]
Expand Down Expand Up @@ -391,24 +392,24 @@ def print_progress(clear = False):
print_overwritable( progress )

def handle_message(self, type, msg_type, msg, value):

if EMBL.PREVIOUS_ERRORS.has_key(msg_type):
EMBL.PREVIOUS_ERRORS[msg_type] += 1

level = eval("logging.%s" % type.upper())

if self.uncompressed_log:
logging.log(level, msg)
else:
if not value: # number of line accepted to display (defaut or given to the method)
value = 5
value = 5
if not EMBL.PREVIOUS_ERRORS.has_key(msg_type) or EMBL.PREVIOUS_ERRORS[msg_type] < value:
logging.log(level, msg)
EMBL.PREVIOUS_ERRORS.setdefault(msg_type,1)
elif EMBL.PREVIOUS_ERRORS[msg_type] == value:
logging.log(level, msg)
final_message = 'We will not display anymore this %s. Please use the --uncompressed_log parameter if you wish having all of them.' % type
logging.log(level, final_message)
logging.log(level, final_message)

#================== def EMBL line =======================
#\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/
Expand Down Expand Up @@ -679,12 +680,12 @@ def FT(self):

####################
# manage locus_tag #

locus_tag=None
#skip feature without locus_tag
if feature.type.lower() != "source" and feature.type.lower() != "gap":

# When option attribute_to_use_as_locus_tag activated, use the value of an attribute defined by the user
# When option attribute_to_use_as_locus_tag activated, use the value of an attribute defined by the user
if self.PREVIOUS_VALUES['attribute_to_use_as_locus_tag']: # If we have defined an attribute to use as locus_tag
attribute = self.PREVIOUS_VALUES['attribute_to_use_as_locus_tag']
for qualifier in feature.qualifiers:
Expand All @@ -694,7 +695,7 @@ def FT(self):
if not locus_tag: #inform the user that we will use the locus_tag instead
msg_type = "I'm suppose to use the value of the attribute %s from the gff3 file as locus_tag but this attribute doesnt exist" % (attribute)
msg = "I'm suppose to use the value of the attribute %s from the gff3 file as locus_tag but this attribute doesnt exist for feature %s. "\
"Consequently I will use the locus_tag %s to create a proper one." % (attribute, feature.id, self.locus_tag)
"Consequently I will use the locus_tag %s to create a proper one." % (attribute, feature.id, self.locus_tag)
self.handle_message("warning", msg_type, msg, None)
# create a locus tag base on the prefix + LOCUS + incremented number
if not locus_tag:
Expand All @@ -712,8 +713,8 @@ def FT(self):
# create locus tag from locus_tag_suffix and accession
locus_tag = "%s_%s" % (locus_tag_prefix, locus_tag_suffix)

f = Feature(feature, self.record.seq, locus_tag, self.transl_table, translate=self.translate,
feature_definition_dir=FEATURE_DIR, qualifier_definition_dir=QUALIFIER_DIR, level=1,
f = Feature(feature, self.record.seq, locus_tag, self.transl_table, translate=self.translate,
feature_definition_dir=FEATURE_DIR, qualifier_definition_dir=QUALIFIER_DIR, level=1,
reorder_gene_features = self.interleave_genes, force_unknown_features = self.force_unknown_features,
force_uncomplete_features = self.force_uncomplete_features, uncompressed_log = self.uncompressed_log, no_wrap_qualifier = self.no_wrap_qualifier)

Expand Down Expand Up @@ -796,7 +797,7 @@ def SQ(self, out = None):
lenSeq = len(seq)
while seqBuilt_len < lenSeq :
slice_seq = seq[seqBuilt_len:(seqBuilt_len+60)]
current_line = " ".join([slice_seq[i*10:(i+1)*10] for i in range(0, 6)])
current_line = " ".join([slice_seq[i*10:(i+1)*10] for i in range(0, 6)])
seqBuilt_len += len(slice_seq)
formatted_line = "\n %s %s" % ("{:65}".format(current_line), "{:>9}".format(str(seqBuilt_len)))

Expand Down Expand Up @@ -860,7 +861,7 @@ def set_classification(self, classification = None, strain = None, environmental
"the /environmental_sample qualifier should also contain the /isolation_source qualifier. entries including "\
"/environmental_sample must not include the /strain qualifier)\nStrain:")
strain = raw_input()
if strain:
if strain:
EMBL.PREVIOUS_VALUES["strain"]=strain
onekey = strain
if not strain: #Entry with /environmental_sample must not include the /strain qualifier
Expand Down Expand Up @@ -990,7 +991,7 @@ def set_keywords(self, keywords = []):
self.keywords = EMBL.PREVIOUS_VALUES["keywords"]
else:
if not keywords:
keywords = [""]
keywords = [""]
EMBL.PREVIOUS_VALUES["keywords"] = keywords
self.keywords = keywords

Expand Down Expand Up @@ -1053,7 +1054,7 @@ def set_organelle(self, organelle = None):
else:
if self.verify and organelle:
organelle = self._verify( self.organelle, "organelle")

self.organelle = organelle
EMBL.PREVIOUS_VALUES["organelle"] = organelle

Expand Down Expand Up @@ -1113,7 +1114,7 @@ def set_taxonomy(self, taxonomy = None):
taxonomy = self._verify( taxonomy, "taxonomy")
else:
taxonomy = "XXX"

self.taxonomy = taxonomy
EMBL.PREVIOUS_VALUES["taxonomy"] = taxonomy

Expand All @@ -1125,7 +1126,7 @@ def set_topology(self, topology = None):
self.topology = EMBL.PREVIOUS_VALUES["topology"]
else:
if self.verify:
topology = self._verify( topology, "topology")
topology = self._verify( topology, "topology")

self.topology = topology
EMBL.PREVIOUS_VALUES["topology"] = topology
Expand All @@ -1136,7 +1137,7 @@ def set_transl_table(self, transl_table = None):
"""
if "transl_table" in EMBL.PREVIOUS_VALUES:
self.transl_table = EMBL.PREVIOUS_VALUES["transl_table"]
else:
else:
if self.verify:
transl_table = self._verify( transl_table, "transl_table")

Expand All @@ -1161,7 +1162,7 @@ def set_version(self, version = None):
"""
if "version" in EMBL.PREVIOUS_VALUES:
self.version = EMBL.PREVIOUS_VALUES["version"]
else:
else:
if version:
version = "SV %i" % version
else:
Expand Down Expand Up @@ -1299,7 +1300,7 @@ def main():
args = parser.parse_args()

# add a convenience argument. This is because I want the argument to be called
# progress in the code, but --no_progress makes more sense as an argument to
# progress in the code, but --no_progress makes more sense as an argument to
# hide the progress bar.
args.progress = args.no_progress

Expand Down Expand Up @@ -1344,7 +1345,7 @@ def main():
infile.seek(0, 0)

for record in GFF.parse(infile, base_dict=seq_dict):

# Check existence of gff seqid among the fasta sequence identifiers
if record.id not in seq_dict:
logging.warning("Sequence id <%s> from the gff file not found within the fasta file. Are you sure to provide the correct" \
Expand Down Expand Up @@ -1376,7 +1377,7 @@ def main():
writer.set_attribute_to_use_as_locus_tag( args.use_attribute_value_as_locus_tag ) #has to be before set_locus_tag
writer.set_locus_tag( args.locus_tag )
writer.set_locus_numbering_start(args.locus_numbering_start)

writer.set_molecule_type( args.molecule_type )
writer.set_no_wrap_qualifier( args.no_wrap_qualifier)
writer.set_organelle( args.organelle )
Expand All @@ -1387,15 +1388,15 @@ def main():
writer.set_translation(args.translate)
writer.set_uncompressed_log(args.uncompressed_log)
writer.set_version( args.version )


writer.add_reference(args.rt, location = args.rl, comment = args.rc, xrefs = args.rx, group = args.rg, authors = args.ra)

write_start = time.time()
writer.write_all( outfile )

writer = None
if args.progress:
if args.progress:
EMBL.print_progress(True)

sys.stderr.write( """Conversion done\n""")
34 changes: 17 additions & 17 deletions EMBLmyGFF3/modules/feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ class Feature(object):

def __init__(self, feature = None, seq = None, accessions = [], transl_table = 1, translation_files = [], translate = False,
feature_definition_dir = DEFAULT_FEATURE_DIR, qualifier_definition_dir = DEFAULT_QUALIFIER_DIR, format_data = True,
level = 0, reorder_gene_features = True, skip_feature = False, force_unknown_features = False,
level = 0, reorder_gene_features = True, skip_feature = False, force_unknown_features = False,
force_uncomplete_features = False, uncompressed_log = None, no_wrap_qualifier = False):
"""
Initializes a Feature, loads json files for feature and
Expand All @@ -90,7 +90,7 @@ def __init__(self, feature = None, seq = None, accessions = [], transl_table = 1
self.reorder_gene_features = reorder_gene_features
self.remove = []
self.seq = seq
self.singleton_types = ["exon"]
self.combine_types = ["CDS","3'UTR","5'UTR"]
self.skip_feature = skip_feature
self.sub_features = []
self.translate = translate
Expand All @@ -105,15 +105,15 @@ def __init__(self, feature = None, seq = None, accessions = [], transl_table = 1
self._load_data(feature, accessions)
self._check_qualifier(feature)

if level == 1:
if level == 1 :
# Parse through subfeatures level2
featureObj_level2 = None
for feature_l2 in feature.sub_features:
featureObj_level2 = Feature(feature_l2, self.seq, accessions, self.transl_table, self.translation_files, self.translate,
self.feature_definition_dir, self.qualifier_definition_dir, format_data = True, level=2)
self.sub_features += [featureObj_level2]


# Sort the sub-features in case they were not ordered into the gff3 file (issue 1)
feature_l2.sub_features.sort(key=lambda x: x.location.start)

Expand All @@ -122,7 +122,7 @@ def __init__(self, feature = None, seq = None, accessions = [], transl_table = 1
for feature_l3 in feature_l2.sub_features:
l3_type = self._from_gff_feature(feature_l3.type)
l2_sub_features = [sf.type for sf in featureObj_level2.sub_features]
if l3_type in l2_sub_features and l3_type not in self.singleton_types:
if l3_type in l2_sub_features and l3_type in self.combine_types:
old_feature = [sf for sf in featureObj_level2.sub_features if sf.type == self._from_gff_feature(feature_l3.type)][0]
old_feature.combine(feature_l3)
else:
Expand Down Expand Up @@ -251,15 +251,15 @@ def _infer_ORFs(self, feature):

# basic info
strand = self.location.strand
# raise an error if no strand for the CDS. Strand is not mandatory (can be a dot) except for CDS where it has an
# raise an error if no strand for the CDS. Strand is not mandatory (can be a dot) except for CDS where it has an
# impact on the translation, and to check where is start and stop codon...
if strand == None:
ID=''
for qualifier in self.feature.qualifiers:
if 'id' == qualifier.lower():
ID = "%s" % " ".join(self.feature.qualifiers[qualifier])
break
logging.error('CDS %s does not have any strand! Please check your gff file.' % ID)
logging.error('CDS %s does not have any strand! Please check your gff file.' % ID)
sys.exit()

if start_codon.upper() not in codon_table.start_codons:
Expand All @@ -278,7 +278,7 @@ def _check_qualifier(self, feature):
# Check presence of mandatory qualifier
if self.qualifiers[qualifier].mandatory:# Check if the qualifier is mandatory
if not self.qualifiers[qualifier].value: # No value for this mandatory qualifier

msg = "The qualifier >%s< is mandatory for the feature >%s<. We will not report the feature." % (qualifier, self.type)
self.handle_message('warning', msg, msg, None)

Expand Down Expand Up @@ -462,7 +462,7 @@ def add_qualifier(self, gff_qualifier, value):
error_regex = True

if error_regex:

msg_type = "The value(s) is(are) invalid for the qualifier %s of the feature %s. We will not report the qualifier. (Here is the regex expected: %s)" % (qualifier, self.type, regex)
msg = "The value(s) %s is(are) invalid for the qualifier %s of the feature %s. We will not report the qualifier. (Here is the regex expected: %s)" % (value, qualifier, self.type, regex)
self.handle_message("warning", msg_type, msg, None)
Expand All @@ -479,7 +479,7 @@ def add_qualifier(self, gff_qualifier, value):
value = ["%s%s" % (v, self.qualifier_suffix[gff_qualifier]) for v in value]

###########################################
# add the value only if not already present
# add the value only if not already present
# List case
if isinstance(value, list):
for val in value:
Expand Down Expand Up @@ -580,7 +580,7 @@ def translation(self):
U = "Sec"; Selenocysteine
O = "Pyl"; Pyrrolysine
"""
codon_table = CodonTable.ambiguous_dna_by_id[self.transl_table]
codon_table = CodonTable.ambiguous_dna_by_id[self.transl_table]
seq = Seq(str(self.sequence()),IUPACAmbiguousDNA())

#start translation according to the phase. Phase and codon_start are not the same coordinate system. It is why we have to remove 1
Expand All @@ -590,7 +590,7 @@ def translation(self):

#check if multiple of three
remaining_nuc = len(seq)%3
if remaining_nuc != 0:
if remaining_nuc != 0:
#create warning
ID=''
for qualifier in self.feature.qualifiers:
Expand All @@ -601,7 +601,7 @@ def translation(self):

#translate the sequence in AA with normal frame even if stop inside
translated_seq = seq.translate(codon_table).tostring().replace('B','X').replace('Z','X').replace('J','X')

#Extra check about stop codon in CDS
if '*' in translated_seq[:-1]: # check if premature stop codon in the translation
ID=''
Expand All @@ -618,24 +618,24 @@ def translation(self):
return translated_seq

def handle_message(self, type, msg_type, msg, value):

if Feature.PREVIOUS_ERRORS.has_key(msg_type):
Feature.PREVIOUS_ERRORS[msg_type] += 1

level = eval("logging.%s" % type.upper())

if self.uncompressed_log:
logging.log(level, msg)
else:
if not value: # number of line accepted to display (defaut or given to the method)
value = 5
value = 5
if not Feature.PREVIOUS_ERRORS.has_key(msg_type) or Feature.PREVIOUS_ERRORS[msg_type] < value:
logging.log(level, msg)
Feature.PREVIOUS_ERRORS.setdefault(msg_type,1)
elif Feature.PREVIOUS_ERRORS[msg_type] == value:
logging.log(level, msg)
final_message = 'We will not display anymore this %s. Please use the --uncompressed_log parameter if you wish having all of them.' % type
logging.log(level, final_message)
logging.log(level, final_message)


if __name__ == '__main__':
Expand Down
Loading

0 comments on commit a181199

Please sign in to comment.