Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: ensembl reference collections #2977

Open
wants to merge 13 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions bio/reference/ensembl-annotation/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,5 @@ output:
- Ensembl GTF or GFF3 annotation file
params:
- url: URL from which to download cache data (optional; defaults to ``ftp://ftp.ensembl.org/pub``)
- branch: branch of the FTP server from which to download cache data, if required (optional; e.g. "plants")
- collection: collection on the FTP server from which to download cache data, if required (optional; e.g. "bacteria_0_collection")
19 changes: 16 additions & 3 deletions bio/reference/ensembl-annotation/test/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ rule get_annotation:
release="105",
build="GRCh37",
flavor="", # optional, e.g. chr_patch_hapl_scaff, see Ensembl FTP.
# branch="plants", # optional: specify branch
log:
"logs/get_annotation.log",
cache: "omit-software" # save space and time with between workflow caching (see docs)
Expand All @@ -22,11 +21,25 @@ rule get_annotation_gz:
release="105",
build="GRCh37",
flavor="", # optional, e.g. chr_patch_hapl_scaff, see Ensembl FTP.
# branch="plants", # optional: specify branch
log:
"logs/get_annotation.log",
mauripops marked this conversation as resolved.
Show resolved Hide resolved
cache: "omit-software" # save space and time with between workflow caching (see docs)
wrapper:
"master/bio/reference/ensembl-annotation"


rule get_off_branch_annotation:
    # Test rule: fetch a GTF annotation for a genome that lives on a
    # non-default FTP server, under a branch and a collection directory.
    output:
        "refs/off_branch_annotation.gtf",
    params:
        species="bacillus_subtilis_subsp_subtilis_str_168_gca_000009045",
        release="59",  # note latest release varies with url
        build="ASM904v1",
        branch="bacteria",  # optional for off branch genomes
        # optional set ftp server source; must be given only once (a repeated
        # keyword is a syntax error) and without a trailing slash, since the
        # wrapper inserts its own "/" between the url and the branch
        url="ftp://ftp.ensemblgenomes.org/pub",
        collection="bacteria_0_collection",  # optional set collection source for genome
    log:
        "logs/get_annotation.log",
    cache: "omit-software"  # save space and time with between workflow caching (see docs)
    wrapper:
        "master/bio/reference/ensembl-annotation"
5 changes: 4 additions & 1 deletion bio/reference/ensembl-annotation/wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@
elif snakemake.params.get("branch"):
branch = snakemake.params.branch + "/"

collection = ""
if snakemake.params.get("collection"):
collection = snakemake.params.collection + "/"
mauripops marked this conversation as resolved.
Show resolved Hide resolved

flavor = snakemake.params.get("flavor", "")
if flavor:
Expand All @@ -49,7 +52,7 @@


url = snakemake.params.get("url", "ftp://ftp.ensembl.org/pub")
url = f"{url}/{branch}release-{release}/{out_fmt}/{species}/{species.capitalize()}.{build}.{gtf_release}.{flavor}{suffix}"
url = f"{url}/{branch}release-{release}/{out_fmt}/{collection}{species}/{species.capitalize()}.{build}.{gtf_release}.{flavor}{suffix}"


try:
Expand Down
2 changes: 2 additions & 0 deletions bio/reference/ensembl-sequence/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,5 @@ output:
- fasta file
params:
- url: URL from which to download cache data (optional; defaults to ``ftp://ftp.ensembl.org/pub``)
- branch: branch of the FTP server from which to download cache data, if required (optional; e.g. "plants")
- collection: collection on the FTP server from which to download cache data, if required (optional; e.g. "bacteria_0_collection")
22 changes: 18 additions & 4 deletions bio/reference/ensembl-sequence/test/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,8 @@ rule get_single_chromosome:
build="R64-1-1",
release="101",
chromosome=["II"], # optional: restrict to one or multiple chromosomes, for multiple see below
# branch="plants", # optional: specify branch
log:
"logs/get_genome.log",
mauripops marked this conversation as resolved.
Show resolved Hide resolved
params:
url="http://ftp.ensembl.org/pub",
cache: "omit-software" # save space and time with between workflow caching (see docs)
wrapper:
"master/bio/reference/ensembl-sequence"
Expand All @@ -40,7 +37,24 @@ rule get_multiple_chromosome:
build="R64-1-1",
release="101",
chromosome=["I", "II"], # optional: restrict to one or multiple chromosomes
# branch="plants", # optional: specify branch
log:
"logs/get_genome.log",
mauripops marked this conversation as resolved.
Show resolved Hide resolved
cache: "omit-software" # save space and time with between workflow caching (see docs)
wrapper:
"master/bio/reference/ensembl-sequence"


rule get_off_branch_genome:
output:
"refs/off_branch_genome.fasta",
params:
species="bacillus_subtilis_subsp_subtilis_str_168_gca_000009045",
datatype="dna",
build="ASM904v1",
release="59", # note latest release varies with url
branch="bacteria", # optional for off branch genomes
url="ftp://ftp.ensemblgenomes.org/pub/", # optional set ftp server source
collection="bacteria_0_collection", # optional set collection source for genome
log:
"logs/get_genome.log",
mauripops marked this conversation as resolved.
Show resolved Hide resolved
cache: "omit-software" # save space and time with between workflow caching (see docs)
Expand Down
17 changes: 13 additions & 4 deletions bio/reference/ensembl-sequence/wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,20 @@
elif snakemake.params.get("branch"):
branch = snakemake.params.branch + "/"

collection = ""
if snakemake.params.get("collection"):
collection = snakemake.params.collection + "/"
mauripops marked this conversation as resolved.
Show resolved Hide resolved

log = snakemake.log_fmt_shell(stdout=False, stderr=True)

spec = ("{build}" if int(release) > 75 else "{build}.{release}").format(
build=build, release=release
)
if branch == "" or branch == "grch37/":
spec = ("{build}" if int(release) > 75 else "{build}.{release}").format(
build=build, release=release
)
else:
spec = ("{build}" if int(release) > 30 else "{build}.{release}").format(
build=build, release=release
)
mauripops marked this conversation as resolved.
Show resolved Hide resolved

suffixes = ""
datatype = snakemake.params.get("datatype", "")
Expand Down Expand Up @@ -52,7 +61,7 @@

url = snakemake.params.get("url", "ftp://ftp.ensembl.org/pub")
spec = spec.format(build=build, release=release)
url_prefix = f"{url}/{branch}release-{release}/fasta/{species}/{datatype}/{species.capitalize()}.{spec}"
url_prefix = f"{url}/{branch}release-{release}/fasta/{collection}{species}/{datatype}/{species.capitalize()}.{spec}"

success = False
for suffix in suffixes:
Expand Down
16 changes: 16 additions & 0 deletions test.py
Original file line number Diff line number Diff line change
Expand Up @@ -5564,6 +5564,14 @@ def test_ensembl_sequence_chromosomes():
)


@skip_if_not_modified
def test_ensembl_sequence_off_branch():
    """Request ``refs/off_branch_genome.fasta`` from the ensembl-sequence wrapper test workflow."""
    wrapper_dir = "bio/reference/ensembl-sequence"
    target = "refs/off_branch_genome.fasta"
    # Same command line as the sibling tests: one core, conda envs, forced rerun.
    snakemake_cmd = ["snakemake", "--cores", "1", target, "--use-conda", "-F"]
    run(wrapper_dir, snakemake_cmd)


@skip_if_not_modified
def test_ensembl_sequence_chromosome_old_release():
run(
Expand Down Expand Up @@ -5597,6 +5605,14 @@ def test_ensembl_annotation_gtf_gz():
)


@skip_if_not_modified
def test_ensembl_off_branch_annotation_gtf():
    """Request ``refs/off_branch_annotation.gtf`` from the ensembl-annotation wrapper test workflow."""
    wrapper_dir = "bio/reference/ensembl-annotation"
    target = "refs/off_branch_annotation.gtf"
    # Same command line as the sibling tests: one core, conda envs, forced rerun.
    snakemake_cmd = ["snakemake", "--cores", "1", target, "--use-conda", "-F"]
    run(wrapper_dir, snakemake_cmd)


@skip_if_not_modified
def test_ensembl_variation():
run(
Expand Down
Loading