implement bakta_io backward compatibility

oschwengers · Dec 2, 2024 · e068540 · e068540
1 parent 9a421f9
commit e068540
Show file tree

Hide file tree

Showing 6 changed files with 52 additions and 45 deletions.
diff --git a/bakta/io/fasta.py b/bakta/io/fasta.py
@@ -70,7 +70,7 @@ def export_sequences(sequences: Sequence[dict], fasta_path: Path, description: b
             else:
                 fh.write(f">{seq['id']}\n")
             if(wrap):
-                fh.write(wrap_sequence(seq['nt']))
+                fh.write(wrap_sequence(seq['nt'] if 'nt' in seq else seq['sequence']))  # <1.10.0 compatibility
             else:
                 fh.write(seq['nt'])
                 fh.write('\n')

diff --git a/bakta/io/gff.py b/bakta/io/gff.py
@@ -23,7 +23,7 @@ def write_features(data: dict, features_by_sequence: Dict[str, dict], gff3_path:
         fh.write('##gff-version 3\n')  # GFF version
         fh.write('##feature-ontology https://github.com/The-Sequence-Ontology/SO-Ontologies/blob/v3.1/so.obo\n')  # SO feature version
 
-        if(data['genome']['taxon']):  # write organism info
+        if(data['genome'].get('taxon', None)):  # write organism info
             fh.write(f"# organism {data['genome']['taxon']}\n")
 
         fh.write('# Annotated with Bakta\n')
@@ -46,6 +46,7 @@ def write_features(data: dict, features_by_sequence: Dict[str, dict], gff3_path:
             fh.write(f"{seq['id']}\tBakta\tregion\t1\t{str(seq['length'])}\t.\t+\t.\t{annotations}\n")
 
             for feat in features_by_sequence[seq['id']]:
+                seq_id = feat['sequence'] if 'sequence' in feat else feat['contig']  # <1.10.0 compatibility
                 start = feat['start']
                 stop = feat['stop']
                 if('edge' in feat):
@@ -83,9 +84,9 @@ def write_features(data: dict, features_by_sequence: Dict[str, dict], gff3_path:
                         if(bc.PSEUDOGENE in feat):
                             gene_annotations[bc.INSDC_FEATURE_PSEUDOGENE] = bc.INSDC_FEATURE_PSEUDOGENE_TYPE_UNKNOWN
                         gene_annotations = encode_annotations(gene_annotations)
-                        fh.write(f"{feat['sequence']}\ttRNAscan-SE\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
+                        fh.write(f"{seq_id}\ttRNAscan-SE\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
                     annotations = encode_annotations(annotations)
-                    fh.write(f"{feat['sequence']}\ttRNAscan-SE\t{so.SO_TRNA.name}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
+                    fh.write(f"{seq_id}\ttRNAscan-SE\t{so.SO_TRNA.name}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
                 elif(feat['type'] == bc.FEATURE_TM_RNA):
                     annotations = {
                         'ID': feat['locus'],
@@ -114,9 +115,9 @@ def write_features(data: dict, features_by_sequence: Dict[str, dict], gff3_path:
                         if('truncated' in feat):
                             gene_annotations[bc.INSDC_FEATURE_PSEUDO] = True
                         gene_annotations = encode_annotations(gene_annotations)
-                        fh.write(f"{feat['sequence']}\tAragorn\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
+                        fh.write(f"{seq_id}\tAragorn\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
                     annotations = encode_annotations(annotations)
-                    fh.write(f"{feat['sequence']}\tAragorn\t{so.SO_TMRNA.name}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
+                    fh.write(f"{seq_id}\tAragorn\t{so.SO_TMRNA.name}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
                 elif(feat['type'] == bc.FEATURE_R_RNA):
                     annotations = {
                         'ID': feat['locus'],
@@ -142,9 +143,9 @@ def write_features(data: dict, features_by_sequence: Dict[str, dict], gff3_path:
                         if('truncated' in feat):
                             gene_annotations[bc.INSDC_FEATURE_PSEUDO] = True
                         gene_annotations = encode_annotations(gene_annotations)
-                        fh.write(f"{feat['sequence']}\tInfernal\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
+                        fh.write(f"{seq_id}\tInfernal\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
                     annotations = encode_annotations(annotations)
-                    fh.write(f"{feat['sequence']}\tInfernal\t{so.SO_RRNA.name}\t{start}\t{stop}\t{feat['evalue']}\t{feat['strand']}\t.\t{annotations}\n")
+                    fh.write(f"{seq_id}\tInfernal\t{so.SO_RRNA.name}\t{start}\t{stop}\t{feat['evalue']}\t{feat['strand']}\t.\t{annotations}\n")
                 elif(feat['type'] == bc.FEATURE_NC_RNA):
                     annotations = {
                         'ID': feat['locus'],
@@ -175,9 +176,9 @@ def write_features(data: dict, features_by_sequence: Dict[str, dict], gff3_path:
                         if('truncated' in feat):
                             gene_annotations[bc.INSDC_FEATURE_PSEUDO] = True
                         gene_annotations = encode_annotations(gene_annotations)
-                        fh.write(f"{feat['sequence']}\tInfernal\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
+                        fh.write(f"{seq_id}\tInfernal\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
                     annotations = encode_annotations(annotations)
-                    fh.write(f"{feat['sequence']}\tInfernal\t{so.SO_NCRNA_GENE.name}\t{start}\t{stop}\t{feat['evalue']}\t{feat['strand']}\t.\t{annotations}\n")
+                    fh.write(f"{seq_id}\tInfernal\t{so.SO_NCRNA_GENE.name}\t{start}\t{stop}\t{feat['evalue']}\t{feat['strand']}\t.\t{annotations}\n")
                 elif(feat['type'] == bc.FEATURE_NC_RNA_REGION):
                     annotations = {
                         'ID': feat['id'],
@@ -193,7 +194,7 @@ def write_features(data: dict, features_by_sequence: Dict[str, dict], gff3_path:
                         annotations['Dbxref'], annotations['Note'] = insdc.revise_dbxref_insdc(feat['db_xrefs'])  # remove INSDC invalid DbXrefs
                         annotations[bc.INSDC_FEATURE_REGULATORY_CLASS] = insdc.select_regulatory_class(feat)
                     annotations = encode_annotations(annotations)
-                    fh.write(f"{feat['sequence']}\tInfernal\t{so.SO_REGULATORY_REGION.name}\t{start}\t{stop}\t{feat['evalue']}\t{feat['strand']}\t.\t{annotations}\n")
+                    fh.write(f"{seq_id}\tInfernal\t{so.SO_REGULATORY_REGION.name}\t{start}\t{stop}\t{feat['evalue']}\t{feat['strand']}\t.\t{annotations}\n")
                 elif(feat['type'] == bc.FEATURE_CRISPR):
                     annotations = {
                         'ID': feat['id'],
@@ -209,7 +210,7 @@ def write_features(data: dict, features_by_sequence: Dict[str, dict], gff3_path:
                         annotations[bc.INSDC_FEATURE_REPEAT_TYPE] = 'direct'
                         annotations[bc.INSDC_FEATURE_REPEAT_UNIT_SEQ] = feat['repeat_consensus']
                     annotations = encode_annotations(annotations)
-                    fh.write(f"{feat['sequence']}\tPILER-CR\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
+                    fh.write(f"{seq_id}\tPILER-CR\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
                     if(not cfg.compliant):
                         i = 0
                         while i < len(feat['spacers']):
@@ -219,21 +220,21 @@ def write_features(data: dict, features_by_sequence: Dict[str, dict], gff3_path:
                                 'Parent': feat['id']
                             }
                             annotations = encode_annotations(annotations)
-                            fh.write(f"{feat['sequence']}\tPILER-CR\t{bc.FEATURE_CRISPR_REPEAT}\t{repeat['start']}\t{repeat['stop']}\t.\t{repeat['strand']}\t.\t{annotations}\n")
+                            fh.write(f"{seq_id}\tPILER-CR\t{bc.FEATURE_CRISPR_REPEAT}\t{repeat['start']}\t{repeat['stop']}\t.\t{repeat['strand']}\t.\t{annotations}\n")
                             spacer = feat['spacers'][i]
                             annotations = {
                                 'ID': f"{feat['id']}_spacer_{i+1}",
                                 'Parent': feat['id'],
                                 'sequence': spacer['sequence']
                             }
                             annotations = encode_annotations(annotations)
-                            fh.write(f"{feat['sequence']}\tPILER-CR\t{bc.FEATURE_CRISPR_SPACER}\t{spacer['start']}\t{spacer['stop']}\t.\t{spacer['strand']}\t.\t{annotations}\n")
+                            fh.write(f"{seq_id}\tPILER-CR\t{bc.FEATURE_CRISPR_SPACER}\t{spacer['start']}\t{spacer['stop']}\t.\t{spacer['strand']}\t.\t{annotations}\n")
                             i += 1
                         if(len(feat['repeats']) - 1 == i):
                             repeat = feat['repeats'][i]
                             annotations = { 'ID': f"{feat['id']}_repeat_{i+1}" }
                             annotations = encode_annotations(annotations)
-                            fh.write(f"{feat['sequence']}\tPILER-CR\t{bc.FEATURE_CRISPR_REPEAT}\t{repeat['start']}\t{repeat['stop']}\t.\t{repeat['strand']}\t.\t{annotations}\n")
+                            fh.write(f"{seq_id}\tPILER-CR\t{bc.FEATURE_CRISPR_REPEAT}\t{repeat['start']}\t{repeat['stop']}\t.\t{repeat['strand']}\t.\t{annotations}\n")
                 elif(feat['type'] == bc.FEATURE_CDS):
                     annotations = {
                         'ID': feat['locus'],
@@ -266,7 +267,7 @@ def write_features(data: dict, features_by_sequence: Dict[str, dict], gff3_path:
                         if(bc.PSEUDOGENE in feat):
                             gene_annotations[bc.INSDC_FEATURE_PSEUDOGENE] = bc.INSDC_FEATURE_PSEUDOGENE_TYPE_UNPROCESSED if feat[bc.PSEUDOGENE]['paralog'] else bc.INSDC_FEATURE_PSEUDOGENE_TYPE_UNITARY
                         gene_annotations = encode_annotations(gene_annotations)
-                        fh.write(f"{feat['sequence']}\t{source}\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
+                        fh.write(f"{seq_id}\t{source}\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
                     if('exception' in feat):
                         ex = feat['exception']
                         pos = f"{ex['start']}..{ex['stop']}"
@@ -278,7 +279,7 @@ def write_features(data: dict, features_by_sequence: Dict[str, dict], gff3_path:
                         if('Notes' not in annotations):
                             annotations['Note'] = notes
                     annotations = encode_annotations(annotations)
-                    fh.write(f"{feat['sequence']}\t{source}\t{so.SO_CDS.name}\t{start}\t{stop}\t.\t{feat['strand']}\t0\t{annotations}\n")
+                    fh.write(f"{seq_id}\t{source}\t{so.SO_CDS.name}\t{start}\t{stop}\t.\t{feat['strand']}\t0\t{annotations}\n")
                     if(bc.FEATURE_SIGNAL_PEPTIDE in feat):
                         write_signal_peptide(fh, feat)
                 elif(feat['type'] == bc.FEATURE_SORF):
@@ -306,9 +307,9 @@ def write_features(data: dict, features_by_sequence: Dict[str, dict], gff3_path:
                         if(feat.get('gene', None)):
                             gene_annotations['gene'] = feat['gene']
                         gene_annotations = encode_annotations(gene_annotations)
-                        fh.write(f"{feat['sequence']}\tBakta\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
+                        fh.write(f"{seq_id}\tBakta\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
                     annotations = encode_annotations(annotations)
-                    fh.write(f"{feat['sequence']}\tBakta\t{so.SO_CDS.name}\t{start}\t{stop}\t.\t{feat['strand']}\t0\t{annotations}\n")
+                    fh.write(f"{seq_id}\tBakta\t{so.SO_CDS.name}\t{start}\t{stop}\t.\t{feat['strand']}\t0\t{annotations}\n")
                     if(bc.FEATURE_SIGNAL_PEPTIDE in feat):
                         write_signal_peptide(fh, feat)
                 elif(feat['type'] == bc.FEATURE_GAP):
@@ -318,7 +319,7 @@ def write_features(data: dict, features_by_sequence: Dict[str, dict], gff3_path:
                         'product': f"gap ({feat['length']} bp)"
                     }
                     annotations = encode_annotations(annotations)
-                    fh.write(f"{feat['sequence']}\tBakta\t{so.SO_GAP.name}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
+                    fh.write(f"{seq_id}\tBakta\t{so.SO_GAP.name}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
                 elif(feat['type'] == bc.FEATURE_ORIC):
                     annotations = {
                         'ID': feat['id'],
@@ -331,7 +332,7 @@ def write_features(data: dict, features_by_sequence: Dict[str, dict], gff3_path:
                         annotations['inference'] = 'similar to DNA sequence'
                     annotations = encode_annotations(annotations)
                     feat_type = bc.INSDC_FEATURE_ORIGIN_REPLICATION if cfg.compliant else so.SO_ORIC.name
-                    fh.write(f"{feat['sequence']}\tBLAST+\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
+                    fh.write(f"{seq_id}\tBLAST+\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
                 elif(feat['type'] == bc.FEATURE_ORIV):
                     annotations = {
                         'ID': feat['id'],
@@ -344,7 +345,7 @@ def write_features(data: dict, features_by_sequence: Dict[str, dict], gff3_path:
                         annotations['inference'] = 'similar to DNA sequence'
                     annotations = encode_annotations(annotations)
                     feat_type = bc.INSDC_FEATURE_ORIGIN_REPLICATION if cfg.compliant else so.SO_ORIC.name
-                    fh.write(f"{feat['sequence']}\tBLAST+\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
+                    fh.write(f"{seq_id}\tBLAST+\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
                 elif(feat['type'] == bc.FEATURE_ORIT):
                     annotations = {
                         'ID': feat['id'],
@@ -357,13 +358,14 @@ def write_features(data: dict, features_by_sequence: Dict[str, dict], gff3_path:
                         annotations['inference'] = 'similar to DNA sequence'
                     annotations = encode_annotations(annotations)
                     feat_type = bc.INSDC_FEATURE_ORIGIN_TRANSFER if cfg.compliant else so.SO_ORIT.name
-                    fh.write(f"{feat['sequence']}\tBLAST+\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
+                    fh.write(f"{seq_id}\tBLAST+\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
 
         if(not cfg.compliant):
             fh.write('##FASTA\n')
             for seq in data['sequences']:  # write sequences
                 fh.write(f">{seq['id']}\n")
-                fh.write(fasta.wrap_sequence(seq['nt']))
+                seq_nt = seq['nt'] if 'nt' in seq else seq['sequence']  # <1.10.0 compatibility
+                fh.write(fasta.wrap_sequence(seq_nt))
     return
 
 
@@ -391,7 +393,7 @@ def encode_annotations(annotations: Dict[str, Union[str, Sequence[str]]]) -> str
     return ';'.join(annotation_strings)
 
 
-def write_signal_peptide(fh, feat: dict):
+def write_signal_peptide(fh, feat: dict):  # <1.10.0 compatibility
     sig_peptide = feat[bc.FEATURE_SIGNAL_PEPTIDE]
     annotations = {
         'ID': f"{feat['locus']}_sigpep",
@@ -401,4 +403,5 @@ def write_signal_peptide(fh, feat: dict):
         'Parent': feat['locus']
     }
     annotations = encode_annotations(annotations)
-    fh.write(f"{feat['sequence']}\tDeepSig\t{so.SO_SIGNAL_PEPTIDE.name}\t{sig_peptide['start']}\t{sig_peptide['stop']}\t{sig_peptide['score']:.2f}\t{feat['strand']}\t.\t{annotations}\n")
+    seq_id = feat['sequence'] if 'sequence' in feat else feat['contig']  # <1.10.0 compatibility
+    fh.write(f"{seq_id}\tDeepSig\t{so.SO_SIGNAL_PEPTIDE.name}\t{sig_peptide['start']}\t{sig_peptide['stop']}\t{sig_peptide['score']:.2f}\t{feat['strand']}\t.\t{annotations}\n")
diff --git a/bakta/io/insdc.py b/bakta/io/insdc.py
@@ -22,7 +22,7 @@
 def build_biopython_sequence_list(data: dict, features: Sequence[dict]):
     sequence_list = []
     for seq in data['sequences']:
-        sequence_features = [feat for feat in features if feat['sequence'] == seq['id']]
+        sequence_features = [feat for feat in features if feat.get('sequence', feat.get('contig')) == seq['id']]  # <1.10.0 compatibility ('contig' key); safe for empty feature lists
         comment = (
             'Annotated with Bakta',
             f"Software: v{bakta.__version__}\n",
@@ -46,7 +46,7 @@ def build_biopython_sequence_list(data: dict, features: Sequence[dict]):
         )
         sequence_annotations = {
             'molecule_type': 'DNA',
-            'source': data['genome']['taxon'],
+            'source': data['genome'].get('taxon', ''),
             'date': date.today().strftime('%d-%b-%Y').upper(),
             'topology': seq['topology'],
             'data_file_division': 'HGT' if seq['type'] == bc.REPLICON_CONTIG else 'BCT',
@@ -60,7 +60,7 @@ def build_biopython_sequence_list(data: dict, features: Sequence[dict]):
         }
 
         description = ''
-        if(data['genome']['taxon']):
+        if(data['genome'].get('taxon', None)):
             sequence_annotations['organism'] = data['genome']['taxon']
             source_qualifiers['organism'] = data['genome']['taxon']
             description = data['genome']['taxon']
@@ -81,7 +81,8 @@ def build_biopython_sequence_list(data: dict, features: Sequence[dict]):
         if(len(description) > 0 and description[0] == ' '):  # discard potential leading whitespace
             description = description[1:]
 
-        sequence_record = SeqIO.SeqRecord(id=seq['id'], name=seq['id'], description=description, annotations=sequence_annotations, seq=Seq(seq['nt']))
+        seq_bio = Seq(seq['nt']) if 'nt' in seq else Seq(seq['sequence'])  # <1.10.0 compatibility
+        sequence_record = SeqIO.SeqRecord(id=seq['id'], name=seq['id'], description=description, annotations=sequence_annotations, seq=seq_bio)
 
         source = SeqFeature(FeatureLocation(0, seq['length'], strand=+1), type='source', qualifiers=source_qualifiers)
         seq_feature_list = [source]
@@ -189,8 +190,8 @@ def build_biopython_sequence_list(data: dict, features: Sequence[dict]):
                 qualifiers['inference'] = 'profile:aragorn:1.2'
                 if('tag' in feature):
                     qualifiers['tag_peptide'] = f"{feature['tag']['start']}..{feature['tag']['stop']}"
-                if feature['strand'] == bc.STRAND_REVERSE:
-                    qualifiers['tag_peptide'] = f"complement({qualifiers['tag_peptide']})"
+                    if feature['strand'] == bc.STRAND_REVERSE:
+                        qualifiers['tag_peptide'] = f"complement({qualifiers['tag_peptide']})"
                 insdc_feature_type = bc.INSDC_FEATURE_TM_RNA
             elif(feature['type'] == bc.FEATURE_R_RNA):
                 for rfam_id in [dbxref.split(':')[1] for dbxref in feature['db_xrefs'] if dbxref.split(':')[0] == bc.DB_XREF_RFAM]: