Skip to content

Commit

Permalink
Merge branch 'master' of github.com:gamcil/cblaster
Browse files Browse the repository at this point in the history
  • Loading branch information
gamcil committed Jan 5, 2023
2 parents 7bf4d91 + 9feb9d4 commit cb6dc9d
Show file tree
Hide file tree
Showing 5 changed files with 34 additions and 12 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
[![Documentation Status](https://readthedocs.org/projects/cblaster/badge/?version=latest)](https://cblaster.readthedocs.io/en/latest/?badge=latest)
[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.3660769.svg)](https://doi.org/10.5281/zenodo.3660769)

>**New!**
>**(Temporarily down)**
Both cblaster and clinker can now be used without installation on the [CAGECAT webserver](http://cagecat.bioinformatics.nl/).

## Outline
Expand Down
2 changes: 1 addition & 1 deletion cblaster/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "1.3.16"
__version__ = "1.3.17"
4 changes: 2 additions & 2 deletions cblaster/extract_clusters.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,15 +257,15 @@ def efetch_nucleotide_sequence(cluster_hierarchy):

LOG.info(
f"Querying NCBI for sequence of {scaffold.accession}"
f" from {cluster.intermediate_start}"
f" from {cluster.intermediate_start + 1}"
f" to {cluster.intermediate_end}"
)
response = requests.post(
"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi",
params={
"db": "sequences",
"rettype": "fasta",
"seq_start": str(cluster.intermediate_start),
"seq_start": str(cluster.intermediate_start + 1), # Convert to 1-based
"seq_stop": str(cluster.intermediate_end),
"strand": "1",
},
Expand Down
2 changes: 1 addition & 1 deletion cblaster/genome_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def find_gene_name(qualifiers):
"""Finds a gene name in a dictionary of feature qualifiers."""
if not isinstance(qualifiers, dict):
raise TypeError("Expected qualifier dictionary")
for tag in ["locus_tag", "protein_id", "id", "gene", "name", "label"]:
for tag in ["protein_id", "locus_tag", "id", "gene", "name", "label"]:
if tag in qualifiers:
return qualifiers[tag][0]
return "N.A."
Expand Down
36 changes: 29 additions & 7 deletions cblaster/intermediate_genes.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,21 @@
PROTEIN_NAME_IDENTIFIERS = ("protein_id", "locus_tag", "gene", "ID", "Name", "label")


def parse_gene_distance(gene_distance):
"""Parse left/right (up/downstream of cluster) gene distance.
Expects int or (int, int), e.g.:
5000 --> left=5000, right=5000
(5000, 10000) --> left=5000, right=10000
"""
if isinstance(gene_distance, int):
return gene_distance, gene_distance
if isinstance(gene_distance, tuple):
if len(gene_distance) > 2:
raise ValueError("gene_distance must be int or (int, int)")
return gene_distance
raise ValueError("Failed to parse intermediate gene distance value")


def set_local_intermediate_genes(sqlite_db, cluster_hierarchy, gene_distance):
"""Adds intermediate genes to clusters in the cluster_hierarchy using a SQLite database
Expand All @@ -32,10 +47,11 @@ def set_local_intermediate_genes(sqlite_db, cluster_hierarchy, gene_distance):
cluster_hierarchy (List): Tuples with cblaster cluster scaffold accession and organism name
gene_distance (int): the extra distance around a cluster to collect genes from
"""
left, right = parse_gene_distance(gene_distance)
for cluster, scaffold, organism in cluster_hierarchy:
scaffold_accession = scaffold.accession
search_start = cluster.start - gene_distance
search_stop = cluster.end + gene_distance
search_start = cluster.start - left
search_stop = cluster.end + right
cluster_ids = [subject.id for subject in cluster.subjects]
cluster.intermediate_genes = [
Subject(id=id, name=name, start=start, end=end, strand=strand)
Expand All @@ -56,7 +72,7 @@ def intermediate_genes_request(accession, start, stop):
params={
"db": "nuccore",
"rettype": "gb",
"from": str(start),
"from": str(start + 1), # 1-based for efetch
"to": str(stop),
},
files={"id": accession},
Expand All @@ -70,7 +86,7 @@ def intermediate_genes_request(accession, start, stop):
return response


def intermediate_genes_genbank(response):
def intermediate_genes_genbank(response, search_start):
features = []
for record in SeqIO.parse(io.StringIO(response.text), "genbank"):
subjects = []
Expand All @@ -79,6 +95,11 @@ def intermediate_genes_genbank(response):
subject = tuple_to_subject(feature)
except TypeError:
continue
# Must add search start to each subject start/end, since sequence
# features in GenBank returned by intermediate_genes_request are
# relative (e.g. cluster 4000-10000, but genes start at 0)
subject.start += search_start
subject.end += search_start
subjects.append(subject)
features.extend(subjects)
return features
Expand All @@ -105,16 +126,17 @@ def set_remote_intermediate_genes(cluster_hierarchy, gene_distance):
cluster_hierarchy (List): Tuples with Cblaster cluster scaffold accession and organism name
gene_distance (int): the extra distance around a cluster to collect genes from
"""
left, right = parse_gene_distance(gene_distance)
passed_time = 0
for cluster, scaffold, _ in cluster_hierarchy:
scaffold_accession = scaffold.accession
if passed_time < MIN_TIME_BETWEEN_REQUEST:
time.sleep(MIN_TIME_BETWEEN_REQUEST - passed_time)
search_start = max(0, cluster.start - gene_distance)
search_stop = cluster.end + gene_distance
search_start = max(0, cluster.start - left)
search_stop = cluster.end + right
start_time = time.time()
response = intermediate_genes_request(scaffold_accession, search_start, search_stop)
subjects = intermediate_genes_genbank(response)
subjects = intermediate_genes_genbank(response, search_start)
cluster.intermediate_genes = get_remote_intermediate_genes(subjects, cluster)
passed_time = time.time() - start_time

Expand Down

0 comments on commit cb6dc9d

Please sign in to comment.