From 51f4d11124d189d870d13aa7a8eec1fb7f3033f1 Mon Sep 17 00:00:00 2001 From: Oliver Schwengers Date: Thu, 10 Oct 2024 14:41:49 +0200 Subject: [PATCH] remove remnant of terms --- bakta/features/annotation.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/bakta/features/annotation.py b/bakta/features/annotation.py index 4480958c..cf2a254e 100644 --- a/bakta/features/annotation.py +++ b/bakta/features/annotation.py @@ -19,6 +19,7 @@ RE_PROTEIN_NODE = re.compile(r'NODE_', flags=re.IGNORECASE) RE_PROTEIN_POTENTIAL_CONTIG_NAME = re.compile(r'(genome|shotgun)', flags=re.IGNORECASE) RE_PROTEIN_DOMAIN_CONTAINING = re.compile(r'domain-containing protein', flags=re.IGNORECASE) +RE_PROTEIN_REMNANT = re.compile(r'Remnant of ', re.IGNORECASE) RE_PROTEIN_NO_LETTERS = re.compile(r'[^A-Za-z]') RE_PROTEIN_SUSPECT_CHARS_DISCARD = re.compile(r'[.#]') RE_PROTEIN_SUSPECT_CHARS_REPLACE = re.compile(r'[@=?%]') @@ -566,6 +567,11 @@ def revise_cds_product(product: str): if(product != old_product): log.info('fix product: replace FOG ids. new=%s, old=%s', product, old_product) + old_product = product + product = RE_PROTEIN_REMNANT.sub('', product) # strip every 'Remnant of ' occurrence (case-insensitive) + if(product != old_product): + log.info('fix product: replace remnant ofs. new=%s, old=%s', product, old_product) + old_product = product dufs = [] # replace DUF-containing products for m in RE_DOMAIN_OF_UNKNOWN_FUCTION.finditer(product):