Skip to content

Commit

Permalink
implement bakta_io backward compatibility
Browse files Browse the repository at this point in the history
  • Loading branch information
oschwengers committed Dec 2, 2024
1 parent 9a421f9 commit e068540
Show file tree
Hide file tree
Showing 6 changed files with 52 additions and 45 deletions.
2 changes: 1 addition & 1 deletion bakta/io/fasta.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def export_sequences(sequences: Sequence[dict], fasta_path: Path, description: b
else:
fh.write(f">{seq['id']}\n")
if(wrap):
fh.write(wrap_sequence(seq['nt']))
fh.write(wrap_sequence(seq['nt'] if 'nt' in seq else seq['sequence'])) # <1.10.0 compatibility
else:
fh.write(seq['nt'] if 'nt' in seq else seq['sequence'])  # <1.10.0 compatibility
fh.write('\n')
Expand Down
53 changes: 28 additions & 25 deletions bakta/io/gff.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def write_features(data: dict, features_by_sequence: Dict[str, dict], gff3_path:
fh.write('##gff-version 3\n') # GFF version
fh.write('##feature-ontology https://github.com/The-Sequence-Ontology/SO-Ontologies/blob/v3.1/so.obo\n') # SO feature version

if(data['genome']['taxon']): # write organism info
if(data['genome'].get('taxon', None)): # write organism info
fh.write(f"# organism {data['genome']['taxon']}\n")

fh.write('# Annotated with Bakta\n')
Expand All @@ -46,6 +46,7 @@ def write_features(data: dict, features_by_sequence: Dict[str, dict], gff3_path:
fh.write(f"{seq['id']}\tBakta\tregion\t1\t{str(seq['length'])}\t.\t+\t.\t{annotations}\n")

for feat in features_by_sequence[seq['id']]:
seq_id = feat['sequence'] if 'sequence' in feat else feat['contig'] # <1.10.0 compatibility
start = feat['start']
stop = feat['stop']
if('edge' in feat):
Expand Down Expand Up @@ -83,9 +84,9 @@ def write_features(data: dict, features_by_sequence: Dict[str, dict], gff3_path:
if(bc.PSEUDOGENE in feat):
gene_annotations[bc.INSDC_FEATURE_PSEUDOGENE] = bc.INSDC_FEATURE_PSEUDOGENE_TYPE_UNKNOWN
gene_annotations = encode_annotations(gene_annotations)
fh.write(f"{feat['sequence']}\ttRNAscan-SE\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
fh.write(f"{seq_id}\ttRNAscan-SE\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
annotations = encode_annotations(annotations)
fh.write(f"{feat['sequence']}\ttRNAscan-SE\t{so.SO_TRNA.name}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
fh.write(f"{seq_id}\ttRNAscan-SE\t{so.SO_TRNA.name}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
elif(feat['type'] == bc.FEATURE_TM_RNA):
annotations = {
'ID': feat['locus'],
Expand Down Expand Up @@ -114,9 +115,9 @@ def write_features(data: dict, features_by_sequence: Dict[str, dict], gff3_path:
if('truncated' in feat):
gene_annotations[bc.INSDC_FEATURE_PSEUDO] = True
gene_annotations = encode_annotations(gene_annotations)
fh.write(f"{feat['sequence']}\tAragorn\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
fh.write(f"{seq_id}\tAragorn\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
annotations = encode_annotations(annotations)
fh.write(f"{feat['sequence']}\tAragorn\t{so.SO_TMRNA.name}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
fh.write(f"{seq_id}\tAragorn\t{so.SO_TMRNA.name}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
elif(feat['type'] == bc.FEATURE_R_RNA):
annotations = {
'ID': feat['locus'],
Expand All @@ -142,9 +143,9 @@ def write_features(data: dict, features_by_sequence: Dict[str, dict], gff3_path:
if('truncated' in feat):
gene_annotations[bc.INSDC_FEATURE_PSEUDO] = True
gene_annotations = encode_annotations(gene_annotations)
fh.write(f"{feat['sequence']}\tInfernal\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
fh.write(f"{seq_id}\tInfernal\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
annotations = encode_annotations(annotations)
fh.write(f"{feat['sequence']}\tInfernal\t{so.SO_RRNA.name}\t{start}\t{stop}\t{feat['evalue']}\t{feat['strand']}\t.\t{annotations}\n")
fh.write(f"{seq_id}\tInfernal\t{so.SO_RRNA.name}\t{start}\t{stop}\t{feat['evalue']}\t{feat['strand']}\t.\t{annotations}\n")
elif(feat['type'] == bc.FEATURE_NC_RNA):
annotations = {
'ID': feat['locus'],
Expand Down Expand Up @@ -175,9 +176,9 @@ def write_features(data: dict, features_by_sequence: Dict[str, dict], gff3_path:
if('truncated' in feat):
gene_annotations[bc.INSDC_FEATURE_PSEUDO] = True
gene_annotations = encode_annotations(gene_annotations)
fh.write(f"{feat['sequence']}\tInfernal\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
fh.write(f"{seq_id}\tInfernal\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
annotations = encode_annotations(annotations)
fh.write(f"{feat['sequence']}\tInfernal\t{so.SO_NCRNA_GENE.name}\t{start}\t{stop}\t{feat['evalue']}\t{feat['strand']}\t.\t{annotations}\n")
fh.write(f"{seq_id}\tInfernal\t{so.SO_NCRNA_GENE.name}\t{start}\t{stop}\t{feat['evalue']}\t{feat['strand']}\t.\t{annotations}\n")
elif(feat['type'] == bc.FEATURE_NC_RNA_REGION):
annotations = {
'ID': feat['id'],
Expand All @@ -193,7 +194,7 @@ def write_features(data: dict, features_by_sequence: Dict[str, dict], gff3_path:
annotations['Dbxref'], annotations['Note'] = insdc.revise_dbxref_insdc(feat['db_xrefs']) # remove INSDC invalid DbXrefs
annotations[bc.INSDC_FEATURE_REGULATORY_CLASS] = insdc.select_regulatory_class(feat)
annotations = encode_annotations(annotations)
fh.write(f"{feat['sequence']}\tInfernal\t{so.SO_REGULATORY_REGION.name}\t{start}\t{stop}\t{feat['evalue']}\t{feat['strand']}\t.\t{annotations}\n")
fh.write(f"{seq_id}\tInfernal\t{so.SO_REGULATORY_REGION.name}\t{start}\t{stop}\t{feat['evalue']}\t{feat['strand']}\t.\t{annotations}\n")
elif(feat['type'] == bc.FEATURE_CRISPR):
annotations = {
'ID': feat['id'],
Expand All @@ -209,7 +210,7 @@ def write_features(data: dict, features_by_sequence: Dict[str, dict], gff3_path:
annotations[bc.INSDC_FEATURE_REPEAT_TYPE] = 'direct'
annotations[bc.INSDC_FEATURE_REPEAT_UNIT_SEQ] = feat['repeat_consensus']
annotations = encode_annotations(annotations)
fh.write(f"{feat['sequence']}\tPILER-CR\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
fh.write(f"{seq_id}\tPILER-CR\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
if(not cfg.compliant):
i = 0
while i < len(feat['spacers']):
Expand All @@ -219,21 +220,21 @@ def write_features(data: dict, features_by_sequence: Dict[str, dict], gff3_path:
'Parent': feat['id']
}
annotations = encode_annotations(annotations)
fh.write(f"{feat['sequence']}\tPILER-CR\t{bc.FEATURE_CRISPR_REPEAT}\t{repeat['start']}\t{repeat['stop']}\t.\t{repeat['strand']}\t.\t{annotations}\n")
fh.write(f"{seq_id}\tPILER-CR\t{bc.FEATURE_CRISPR_REPEAT}\t{repeat['start']}\t{repeat['stop']}\t.\t{repeat['strand']}\t.\t{annotations}\n")
spacer = feat['spacers'][i]
annotations = {
'ID': f"{feat['id']}_spacer_{i+1}",
'Parent': feat['id'],
'sequence': spacer['sequence']
}
annotations = encode_annotations(annotations)
fh.write(f"{feat['sequence']}\tPILER-CR\t{bc.FEATURE_CRISPR_SPACER}\t{spacer['start']}\t{spacer['stop']}\t.\t{spacer['strand']}\t.\t{annotations}\n")
fh.write(f"{seq_id}\tPILER-CR\t{bc.FEATURE_CRISPR_SPACER}\t{spacer['start']}\t{spacer['stop']}\t.\t{spacer['strand']}\t.\t{annotations}\n")
i += 1
if(len(feat['repeats']) - 1 == i):
repeat = feat['repeats'][i]
annotations = { 'ID': f"{feat['id']}_repeat_{i+1}" }
annotations = encode_annotations(annotations)
fh.write(f"{feat['sequence']}\tPILER-CR\t{bc.FEATURE_CRISPR_REPEAT}\t{repeat['start']}\t{repeat['stop']}\t.\t{repeat['strand']}\t.\t{annotations}\n")
fh.write(f"{seq_id}\tPILER-CR\t{bc.FEATURE_CRISPR_REPEAT}\t{repeat['start']}\t{repeat['stop']}\t.\t{repeat['strand']}\t.\t{annotations}\n")
elif(feat['type'] == bc.FEATURE_CDS):
annotations = {
'ID': feat['locus'],
Expand Down Expand Up @@ -266,7 +267,7 @@ def write_features(data: dict, features_by_sequence: Dict[str, dict], gff3_path:
if(bc.PSEUDOGENE in feat):
gene_annotations[bc.INSDC_FEATURE_PSEUDOGENE] = bc.INSDC_FEATURE_PSEUDOGENE_TYPE_UNPROCESSED if feat[bc.PSEUDOGENE]['paralog'] else bc.INSDC_FEATURE_PSEUDOGENE_TYPE_UNITARY
gene_annotations = encode_annotations(gene_annotations)
fh.write(f"{feat['sequence']}\t{source}\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
fh.write(f"{seq_id}\t{source}\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
if('exception' in feat):
ex = feat['exception']
pos = f"{ex['start']}..{ex['stop']}"
Expand All @@ -278,7 +279,7 @@ def write_features(data: dict, features_by_sequence: Dict[str, dict], gff3_path:
if('Notes' not in annotations):
annotations['Note'] = notes
annotations = encode_annotations(annotations)
fh.write(f"{feat['sequence']}\t{source}\t{so.SO_CDS.name}\t{start}\t{stop}\t.\t{feat['strand']}\t0\t{annotations}\n")
fh.write(f"{seq_id}\t{source}\t{so.SO_CDS.name}\t{start}\t{stop}\t.\t{feat['strand']}\t0\t{annotations}\n")
if(bc.FEATURE_SIGNAL_PEPTIDE in feat):
write_signal_peptide(fh, feat)
elif(feat['type'] == bc.FEATURE_SORF):
Expand Down Expand Up @@ -306,9 +307,9 @@ def write_features(data: dict, features_by_sequence: Dict[str, dict], gff3_path:
if(feat.get('gene', None)):
gene_annotations['gene'] = feat['gene']
gene_annotations = encode_annotations(gene_annotations)
fh.write(f"{feat['sequence']}\tBakta\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
fh.write(f"{seq_id}\tBakta\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
annotations = encode_annotations(annotations)
fh.write(f"{feat['sequence']}\tBakta\t{so.SO_CDS.name}\t{start}\t{stop}\t.\t{feat['strand']}\t0\t{annotations}\n")
fh.write(f"{seq_id}\tBakta\t{so.SO_CDS.name}\t{start}\t{stop}\t.\t{feat['strand']}\t0\t{annotations}\n")
if(bc.FEATURE_SIGNAL_PEPTIDE in feat):
write_signal_peptide(fh, feat)
elif(feat['type'] == bc.FEATURE_GAP):
Expand All @@ -318,7 +319,7 @@ def write_features(data: dict, features_by_sequence: Dict[str, dict], gff3_path:
'product': f"gap ({feat['length']} bp)"
}
annotations = encode_annotations(annotations)
fh.write(f"{feat['sequence']}\tBakta\t{so.SO_GAP.name}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
fh.write(f"{seq_id}\tBakta\t{so.SO_GAP.name}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
elif(feat['type'] == bc.FEATURE_ORIC):
annotations = {
'ID': feat['id'],
Expand All @@ -331,7 +332,7 @@ def write_features(data: dict, features_by_sequence: Dict[str, dict], gff3_path:
annotations['inference'] = 'similar to DNA sequence'
annotations = encode_annotations(annotations)
feat_type = bc.INSDC_FEATURE_ORIGIN_REPLICATION if cfg.compliant else so.SO_ORIC.name
fh.write(f"{feat['sequence']}\tBLAST+\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
fh.write(f"{seq_id}\tBLAST+\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
elif(feat['type'] == bc.FEATURE_ORIV):
annotations = {
'ID': feat['id'],
Expand All @@ -344,7 +345,7 @@ def write_features(data: dict, features_by_sequence: Dict[str, dict], gff3_path:
annotations['inference'] = 'similar to DNA sequence'
annotations = encode_annotations(annotations)
feat_type = bc.INSDC_FEATURE_ORIGIN_REPLICATION if cfg.compliant else so.SO_ORIC.name
fh.write(f"{feat['sequence']}\tBLAST+\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
fh.write(f"{seq_id}\tBLAST+\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
elif(feat['type'] == bc.FEATURE_ORIT):
annotations = {
'ID': feat['id'],
Expand All @@ -357,13 +358,14 @@ def write_features(data: dict, features_by_sequence: Dict[str, dict], gff3_path:
annotations['inference'] = 'similar to DNA sequence'
annotations = encode_annotations(annotations)
feat_type = bc.INSDC_FEATURE_ORIGIN_TRANSFER if cfg.compliant else so.SO_ORIT.name
fh.write(f"{feat['sequence']}\tBLAST+\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
fh.write(f"{seq_id}\tBLAST+\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")

if(not cfg.compliant):
fh.write('##FASTA\n')
for seq in data['sequences']: # write sequences
fh.write(f">{seq['id']}\n")
fh.write(fasta.wrap_sequence(seq['nt']))
seq_nt = seq['nt'] if 'nt' in seq else seq['sequence'] # <1.10.0 compatibility
fh.write(fasta.wrap_sequence(seq_nt))
return


Expand Down Expand Up @@ -391,7 +393,7 @@ def encode_annotations(annotations: Dict[str, Union[str, Sequence[str]]]) -> str
return ';'.join(annotation_strings)


def write_signal_peptide(fh, feat: dict):
def write_signal_peptide(fh, feat: dict): # <1.10.0 compatibility
sig_peptide = feat[bc.FEATURE_SIGNAL_PEPTIDE]
annotations = {
'ID': f"{feat['locus']}_sigpep",
Expand All @@ -401,4 +403,5 @@ def write_signal_peptide(fh, feat: dict):
'Parent': feat['locus']
}
annotations = encode_annotations(annotations)
fh.write(f"{feat['sequence']}\tDeepSig\t{so.SO_SIGNAL_PEPTIDE.name}\t{sig_peptide['start']}\t{sig_peptide['stop']}\t{sig_peptide['score']:.2f}\t{feat['strand']}\t.\t{annotations}\n")
seq_id = feat['sequence'] if 'sequence' in feat else feat['contig'] # <1.10.0 compatibility
fh.write(f"{seq_id}\tDeepSig\t{so.SO_SIGNAL_PEPTIDE.name}\t{sig_peptide['start']}\t{sig_peptide['stop']}\t{sig_peptide['score']:.2f}\t{feat['strand']}\t.\t{annotations}\n")
13 changes: 7 additions & 6 deletions bakta/io/insdc.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
def build_biopython_sequence_list(data: dict, features: Sequence[dict]):
sequence_list = []
for seq in data['sequences']:
sequence_features = [feat for feat in features if feat['sequence'] == seq['id']]
sequence_features = [feat for feat in features if feat['sequence'] == seq['id']] if 'sequence' in features[0] else [feat for feat in features if feat['contig'] == seq['id']] # <1.10.0 compatibility
comment = (
'Annotated with Bakta',
f"Software: v{bakta.__version__}\n",
Expand All @@ -46,7 +46,7 @@ def build_biopython_sequence_list(data: dict, features: Sequence[dict]):
)
sequence_annotations = {
'molecule_type': 'DNA',
'source': data['genome']['taxon'],
'source': data['genome'].get('taxon', ''),
'date': date.today().strftime('%d-%b-%Y').upper(),
'topology': seq['topology'],
'data_file_division': 'HGT' if seq['type'] == bc.REPLICON_CONTIG else 'BCT',
Expand All @@ -60,7 +60,7 @@ def build_biopython_sequence_list(data: dict, features: Sequence[dict]):
}

description = ''
if(data['genome']['taxon']):
if(data['genome'].get('taxon', None)):
sequence_annotations['organism'] = data['genome']['taxon']
source_qualifiers['organism'] = data['genome']['taxon']
description = data['genome']['taxon']
Expand All @@ -81,7 +81,8 @@ def build_biopython_sequence_list(data: dict, features: Sequence[dict]):
if(len(description) > 0 and description[0] == ' '): # discard potential leading whitespace
description = description[1:]

sequence_record = SeqIO.SeqRecord(id=seq['id'], name=seq['id'], description=description, annotations=sequence_annotations, seq=Seq(seq['nt']))
seq_bio = Seq(seq['nt']) if 'nt' in seq else Seq(seq['sequence']) # <1.10.0 compatibility
sequence_record = SeqIO.SeqRecord(id=seq['id'], name=seq['id'], description=description, annotations=sequence_annotations, seq=seq_bio)

source = SeqFeature(FeatureLocation(0, seq['length'], strand=+1), type='source', qualifiers=source_qualifiers)
seq_feature_list = [source]
Expand Down Expand Up @@ -189,8 +190,8 @@ def build_biopython_sequence_list(data: dict, features: Sequence[dict]):
qualifiers['inference'] = 'profile:aragorn:1.2'
if('tag' in feature):
qualifiers['tag_peptide'] = f"{feature['tag']['start']}..{feature['tag']['stop']}"
if feature['strand'] == bc.STRAND_REVERSE:
qualifiers['tag_peptide'] = f"complement({qualifiers['tag_peptide']})"
if feature['strand'] == bc.STRAND_REVERSE:
qualifiers['tag_peptide'] = f"complement({qualifiers['tag_peptide']})"
insdc_feature_type = bc.INSDC_FEATURE_TM_RNA
elif(feature['type'] == bc.FEATURE_R_RNA):
for rfam_id in [dbxref.split(':')[1] for dbxref in feature['db_xrefs'] if dbxref.split(':')[0] == bc.DB_XREF_RFAM]:
Expand Down
Loading

0 comments on commit e068540

Please sign in to comment.