Merge pull request #47 from NBISweden/feature/1.3

Feature/1.3
NBISweden · Oct 18, 2019 · a181199 · a181199
2 parents 789f25f + 323de02
commit a181199
Show file tree

Hide file tree

Showing 4 changed files with 943 additions and 190 deletions.
diff --git a/EMBLmyGFF3/EMBLmyGFF3.py b/EMBLmyGFF3/EMBLmyGFF3.py
@@ -154,7 +154,7 @@ def __init__(self, record = None, verify = False, guess = True):
         self.translate = False
 
 #================== def methods =======================
-#\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/        
+#\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/
 
     def _add_mandatory(self):
         """
@@ -180,7 +180,7 @@ def _add_mandatory(self):
                 source_feature.qualifiers["isolation_source"] = EMBL.PREVIOUS_VALUES["isolation_source"]
             if "isolate" in  EMBL.PREVIOUS_VALUES:
                 source_feature.qualifiers["isolate"] = EMBL.PREVIOUS_VALUES["isolate"]
-
+            source_feature.sub_features = [] # important to define that there is no sub_features
             self.record.features[0:0] = [source_feature]
 
         # Make sure that there's a gap feature for every span of n's
@@ -205,13 +205,14 @@ def _add_mandatory(self):
                     gap_location = FeatureLocation(ExactPosition(start), ExactPosition(end))
                     gap_location.strand = 1
                     gap_feature = SeqFeature( gap_location )
+                    gap_feature.sub_features = [] # important to define that there is no sub_features
                     gap_feature.qualifiers["estimated_length"] = end-start
                     gap_feature.type = "gap"
                     self.record.features += [gap_feature]
                     if EMBL.total_features:
                         EMBL.total_features += 1
                 # Move +1 to start back from outside the gap
-                end +=1 
+                end +=1
                 start = seq.index('n',end)
 
         except ValueError as e:
@@ -285,10 +286,10 @@ def _verify_locus_tag(self, locus_tag):
         """
         checked_locus_tag=None
         while not checked_locus_tag:
-            
-            if not locus_tag: 
+
+            if not locus_tag:
                 sys.stderr.write("No value provided as locus_tag.\nPlease provide a locus_tag (A default XXX locus_tag will be set up if none provided):")
-                locus_tag = raw_input()      
+                locus_tag = raw_input()
                 if not locus_tag:
                     checked_locus_tag="XXX"
                     break
@@ -299,9 +300,9 @@ def _verify_locus_tag(self, locus_tag):
 
             # if starts with a letter
             if locus_tag[:1].isalpha():
-                # if contains only alpha-numeric characters 
+                # if contains only alpha-numeric characters
                 if locus_tag.isalnum():
-                    # if more than 3 characters    
+                    # if more than 3 characters
                     if len(locus_tag) >= 3:
                         checked_locus_tag = locus_tag # passes all checks, save the value !
                     else:
@@ -350,7 +351,7 @@ def get_taxid_from_species(self, species):
                 search = Entrez.esearch(term=species, db="taxonomy", retmode="xml")
                 record = Entrez.read(search)
                 if not record['IdList']: #no taxid found
-                    logging.error("Please verify the species name. '%s' species is unknown into the NCBI taxonomy databse. Impossible to check the taxonomic classification. We will use the default value 'Life' to populate the OC line.",self.species)
+                    logging.warning("Please verify the species name. '%s' species is unknown into the NCBI taxonomy databse. Impossible to check the taxonomic classification. We will use the default value 'Life' to populate the OC line.",self.species)
                     taxid=None
                 else:
                     taxid = record['IdList'][0]
@@ -391,24 +392,24 @@ def print_progress(clear = False):
             print_overwritable( progress )
 
     def handle_message(self, type, msg_type, msg, value):
-    
+
         if EMBL.PREVIOUS_ERRORS.has_key(msg_type):
             EMBL.PREVIOUS_ERRORS[msg_type] += 1
-        
+
         level = eval("logging.%s" % type.upper())
 
         if self.uncompressed_log:
             logging.log(level, msg)
         else:
             if not value:   # number of line accepted to display (defaut or given to the method)
-                value = 5    
+                value = 5
             if not EMBL.PREVIOUS_ERRORS.has_key(msg_type) or EMBL.PREVIOUS_ERRORS[msg_type] < value:
                 logging.log(level, msg)
                 EMBL.PREVIOUS_ERRORS.setdefault(msg_type,1)
             elif EMBL.PREVIOUS_ERRORS[msg_type] == value:
                 logging.log(level, msg)
                 final_message = 'We will not display anymore this %s. Please use the --uncompressed_log parameter if you wish having all of them.' % type
-                logging.log(level, final_message) 
+                logging.log(level, final_message)
 
 #================== def EMBL line =======================
 #\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/
@@ -679,12 +680,12 @@ def FT(self):
 
             ####################
             # manage locus_tag #
-            
+
             locus_tag=None
             #skip feature without locus_tag
             if feature.type.lower() != "source" and feature.type.lower() != "gap":
 
-                # When option attribute_to_use_as_locus_tag activated, use the value of an attribute defined by the user 
+                # When option attribute_to_use_as_locus_tag activated, use the value of an attribute defined by the user
                 if self.PREVIOUS_VALUES['attribute_to_use_as_locus_tag']: # If we have defined an attribute to use as locus_tag
                     attribute = self.PREVIOUS_VALUES['attribute_to_use_as_locus_tag']
                     for qualifier in feature.qualifiers:
@@ -694,7 +695,7 @@ def FT(self):
                     if not locus_tag: #inform the user that we will use the locus_tag instead
                         msg_type = "I'm suppose to use the value of the attribute %s from the gff3 file as locus_tag but this attribute doesnt exist" % (attribute)
                         msg = "I'm suppose to use the value of the attribute %s from the gff3 file as locus_tag but this attribute doesnt exist for feature %s. "\
-                        "Consequently I will use the locus_tag %s to create a proper one." % (attribute, feature.id, self.locus_tag) 
+                        "Consequently I will use the locus_tag %s to create a proper one." % (attribute, feature.id, self.locus_tag)
                         self.handle_message("warning", msg_type, msg, None)
                 # create a locus tag base on the prefix + LOCUS + incremented number
                 if not locus_tag:
@@ -712,8 +713,8 @@ def FT(self):
                     # create locus tag from locus_tag_suffix and accession
                     locus_tag = "%s_%s" % (locus_tag_prefix, locus_tag_suffix)
 
-            f = Feature(feature, self.record.seq, locus_tag, self.transl_table, translate=self.translate, 
-                feature_definition_dir=FEATURE_DIR, qualifier_definition_dir=QUALIFIER_DIR, level=1, 
+            f = Feature(feature, self.record.seq, locus_tag, self.transl_table, translate=self.translate,
+                feature_definition_dir=FEATURE_DIR, qualifier_definition_dir=QUALIFIER_DIR, level=1,
                 reorder_gene_features = self.interleave_genes, force_unknown_features = self.force_unknown_features,
                  force_uncomplete_features = self.force_uncomplete_features, uncompressed_log = self.uncompressed_log, no_wrap_qualifier = self.no_wrap_qualifier)
 
@@ -796,7 +797,7 @@ def SQ(self, out = None):
         lenSeq = len(seq)
         while seqBuilt_len < lenSeq :
             slice_seq = seq[seqBuilt_len:(seqBuilt_len+60)]
-            current_line = " ".join([slice_seq[i*10:(i+1)*10] for i in range(0, 6)]) 
+            current_line = " ".join([slice_seq[i*10:(i+1)*10] for i in range(0, 6)])
             seqBuilt_len += len(slice_seq)
             formatted_line = "\n     %s %s" % ("{:65}".format(current_line), "{:>9}".format(str(seqBuilt_len)))
 
@@ -860,7 +861,7 @@ def set_classification(self, classification = None, strain = None, environmental
                                          "the /environmental_sample qualifier should also contain the /isolation_source qualifier. entries including "\
                                          "/environmental_sample must not include the /strain qualifier)\nStrain:")
                         strain = raw_input()
-                        if strain: 
+                        if strain:
                             EMBL.PREVIOUS_VALUES["strain"]=strain
                             onekey = strain
                         if not strain: #Entry with /environmental_sample must not include the /strain qualifier
@@ -990,7 +991,7 @@ def set_keywords(self, keywords = []):
             self.keywords = EMBL.PREVIOUS_VALUES["keywords"]
         else:
             if not keywords:
-                keywords = [""]   
+                keywords = [""]
             EMBL.PREVIOUS_VALUES["keywords"] = keywords
             self.keywords = keywords
 
@@ -1053,7 +1054,7 @@ def set_organelle(self, organelle = None):
         else:
             if self.verify and organelle:
                 organelle = self._verify( self.organelle, "organelle")
-            
+
             self.organelle = organelle
             EMBL.PREVIOUS_VALUES["organelle"] = organelle
 
@@ -1113,7 +1114,7 @@ def set_taxonomy(self, taxonomy = None):
                     taxonomy = self._verify( taxonomy, "taxonomy")
             else:
                 taxonomy = "XXX"
-            
+
             self.taxonomy = taxonomy
             EMBL.PREVIOUS_VALUES["taxonomy"] = taxonomy
 
@@ -1125,7 +1126,7 @@ def set_topology(self, topology = None):
             self.topology = EMBL.PREVIOUS_VALUES["topology"]
         else:
             if self.verify:
-                topology = self._verify( topology, "topology")      
+                topology = self._verify( topology, "topology")
 
             self.topology = topology
             EMBL.PREVIOUS_VALUES["topology"] = topology
@@ -1136,7 +1137,7 @@ def set_transl_table(self, transl_table = None):
         """
         if "transl_table" in EMBL.PREVIOUS_VALUES:
             self.transl_table = EMBL.PREVIOUS_VALUES["transl_table"]
-        else:    
+        else:
             if self.verify:
                 transl_table = self._verify( transl_table, "transl_table")
 
@@ -1161,7 +1162,7 @@ def set_version(self, version = None):
         """
         if "version" in EMBL.PREVIOUS_VALUES:
             self.version = EMBL.PREVIOUS_VALUES["version"]
-        else:    
+        else:
             if version:
                 version = "SV %i" % version
             else:
@@ -1299,7 +1300,7 @@ def main():
     args = parser.parse_args()
 
     # add a convenience argument. This is because I want the argument to be called
-    # progress in the code, but --no_progress makes more sense as an argument to 
+    # progress in the code, but --no_progress makes more sense as an argument to
     # hide the progress bar.
     args.progress = args.no_progress
 
@@ -1344,7 +1345,7 @@ def main():
         infile.seek(0, 0)
 
     for record in GFF.parse(infile, base_dict=seq_dict):
-        
+
         # Check existence of gff seqid among the fasta sequence identifiers
         if record.id not in seq_dict:
             logging.warning("Sequence id <%s> from the gff file not found within the fasta file. Are you sure to provide the correct" \
@@ -1376,7 +1377,7 @@ def main():
         writer.set_attribute_to_use_as_locus_tag( args.use_attribute_value_as_locus_tag ) #has to be before set_locus_tag
         writer.set_locus_tag( args.locus_tag )
         writer.set_locus_numbering_start(args.locus_numbering_start)
-        
+
         writer.set_molecule_type( args.molecule_type )
         writer.set_no_wrap_qualifier( args.no_wrap_qualifier)
         writer.set_organelle( args.organelle )
@@ -1387,15 +1388,15 @@ def main():
         writer.set_translation(args.translate)
         writer.set_uncompressed_log(args.uncompressed_log)
         writer.set_version( args.version )
-        
+
 
         writer.add_reference(args.rt, location = args.rl, comment = args.rc, xrefs = args.rx, group = args.rg, authors = args.ra)
 
         write_start = time.time()
         writer.write_all( outfile )
 
         writer = None
-    if args.progress:    
+    if args.progress:
         EMBL.print_progress(True)
 
     sys.stderr.write( """Conversion done\n""")
diff --git a/EMBLmyGFF3/modules/feature.py b/EMBLmyGFF3/modules/feature.py
@@ -67,7 +67,7 @@ class Feature(object):
 
     def __init__(self, feature = None, seq = None, accessions = [], transl_table = 1, translation_files = [], translate = False,
                 feature_definition_dir = DEFAULT_FEATURE_DIR, qualifier_definition_dir = DEFAULT_QUALIFIER_DIR, format_data = True,
-                level = 0, reorder_gene_features = True, skip_feature = False, force_unknown_features = False, 
+                level = 0, reorder_gene_features = True, skip_feature = False, force_unknown_features = False,
                 force_uncomplete_features = False, uncompressed_log = None, no_wrap_qualifier = False):
         """
         Initializes a Feature, loads json files for feature and
@@ -90,7 +90,7 @@ def __init__(self, feature = None, seq = None, accessions = [], transl_table = 1
         self.reorder_gene_features = reorder_gene_features
         self.remove = []
         self.seq = seq
-        self.singleton_types = ["exon"]
+        self.combine_types = ["CDS","3'UTR","5'UTR"]
         self.skip_feature = skip_feature
         self.sub_features = []
         self.translate = translate
@@ -105,15 +105,15 @@ def __init__(self, feature = None, seq = None, accessions = [], transl_table = 1
         self._load_data(feature, accessions)
         self._check_qualifier(feature)
 
-        if level == 1:
+        if level == 1 :
             # Parse through subfeatures level2
             featureObj_level2 = None
             for feature_l2 in feature.sub_features:
                 featureObj_level2 = Feature(feature_l2, self.seq, accessions, self.transl_table, self.translation_files, self.translate,
                                                       self.feature_definition_dir, self.qualifier_definition_dir, format_data = True, level=2)
                 self.sub_features += [featureObj_level2]
 
-                
+
                 # Sort the sub-features in case they were not ordered into the gff3 file (issue 1)
                 feature_l2.sub_features.sort(key=lambda x: x.location.start)
 
@@ -122,7 +122,7 @@ def __init__(self, feature = None, seq = None, accessions = [], transl_table = 1
                 for feature_l3 in feature_l2.sub_features:
                     l3_type = self._from_gff_feature(feature_l3.type)
                     l2_sub_features = [sf.type for sf in featureObj_level2.sub_features]
-                    if l3_type in l2_sub_features and l3_type not in self.singleton_types:
+                    if l3_type in l2_sub_features and l3_type in self.combine_types:
                         old_feature = [sf for sf in featureObj_level2.sub_features if sf.type == self._from_gff_feature(feature_l3.type)][0]
                         old_feature.combine(feature_l3)
                     else:
@@ -251,15 +251,15 @@ def _infer_ORFs(self, feature):
 
             # basic info
             strand = self.location.strand
-            # raise an error if no strand for the CDS. Strand is not mandatory (can be a dot) except for CDS where it has an 
+            # raise an error if no strand for the CDS. Strand is not mandatory (can be a dot) except for CDS where it has an
             # impact on the translation, and to check where is start and stop codon...
             if strand == None:
                 ID=''
                 for qualifier in self.feature.qualifiers:
                     if 'id' == qualifier.lower():
                         ID =  "%s" % " ".join(self.feature.qualifiers[qualifier])
                         break
-                logging.error('CDS %s does not have any strand! Please check your gff file.'  %  ID) 
+                logging.error('CDS %s does not have any strand! Please check your gff file.'  %  ID)
                 sys.exit()
 
             if start_codon.upper() not in codon_table.start_codons:
@@ -278,7 +278,7 @@ def _check_qualifier(self, feature):
             # Check presence of mandatory qualifier
             if self.qualifiers[qualifier].mandatory:# Check if the qualifier is mandatory
                 if not self.qualifiers[qualifier].value: # No value for this mandatory qualifier
-                    
+
                     msg = "The qualifier >%s< is mandatory for the feature >%s<. We will not report the feature." % (qualifier, self.type)
                     self.handle_message('warning', msg, msg, None)
 
@@ -462,7 +462,7 @@ def add_qualifier(self, gff_qualifier, value):
                 error_regex = True
 
             if error_regex:
-                
+
                 msg_type = "The value(s) is(are) invalid for the qualifier %s of the feature %s. We will not report the qualifier. (Here is the regex expected: %s)"  % (qualifier, self.type, regex)
                 msg = "The value(s) %s is(are) invalid for the qualifier %s of the feature %s. We will not report the qualifier. (Here is the regex expected: %s)"  % (value, qualifier, self.type, regex)
                 self.handle_message("warning", msg_type, msg, None)
@@ -479,7 +479,7 @@ def add_qualifier(self, gff_qualifier, value):
             value = ["%s%s" % (v, self.qualifier_suffix[gff_qualifier]) for v in value]
 
         ###########################################
-        # add the value only if not already present         
+        # add the value only if not already present
         # List case
         if isinstance(value, list):
             for val in value:
@@ -580,7 +580,7 @@ def translation(self):
         U = "Sec";  Selenocysteine
         O = "Pyl";  Pyrrolysine
         """
-        codon_table = CodonTable.ambiguous_dna_by_id[self.transl_table]  
+        codon_table = CodonTable.ambiguous_dna_by_id[self.transl_table]
         seq = Seq(str(self.sequence()),IUPACAmbiguousDNA())
 
         #start translation according to the phase. Phase and codon_start are not the same coordinate system. It is why we have to remove 1
@@ -590,7 +590,7 @@ def translation(self):
 
         #check if multiple of three
         remaining_nuc = len(seq)%3
-        if remaining_nuc != 0:   
+        if remaining_nuc != 0:
             #create warning
             ID=''
             for qualifier in self.feature.qualifiers:
@@ -601,7 +601,7 @@ def translation(self):
 
         #translate the sequence in AA with normal frame even if stop inside
         translated_seq = seq.translate(codon_table).tostring().replace('B','X').replace('Z','X').replace('J','X')
-        
+
         #Extra check about stop codon in CDS
         if '*' in translated_seq[:-1]: # check if premature stop codon in the translation
             ID=''
@@ -618,24 +618,24 @@ def translation(self):
         return translated_seq
 
     def handle_message(self, type, msg_type, msg, value):
-    
+
         if Feature.PREVIOUS_ERRORS.has_key(msg_type):
             Feature.PREVIOUS_ERRORS[msg_type] += 1
-        
+
         level = eval("logging.%s" % type.upper())
 
         if self.uncompressed_log:
             logging.log(level, msg)
         else:
             if not value:   # number of line accepted to display (defaut or given to the method)
-                value = 5    
+                value = 5
             if not Feature.PREVIOUS_ERRORS.has_key(msg_type) or Feature.PREVIOUS_ERRORS[msg_type] < value:
                 logging.log(level, msg)
                 Feature.PREVIOUS_ERRORS.setdefault(msg_type,1)
             elif Feature.PREVIOUS_ERRORS[msg_type] == value:
                 logging.log(level, msg)
                 final_message = 'We will not display anymore this %s. Please use the --uncompressed_log parameter if you wish having all of them.' % type
-                logging.log(level, final_message) 
+                logging.log(level, final_message)
 
 
 if __name__ == '__main__':