Skip to content

Commit

Permalink
Merge pull request #60 from apriltuesday/ftp-upload
Browse files Browse the repository at this point in the history
Add FTP upload
  • Loading branch information
kdp-cloud authored Nov 6, 2024
2 parents ab7c5f7 + 74a5404 commit 8d7dcde
Show file tree
Hide file tree
Showing 6 changed files with 157 additions and 1 deletion.
2 changes: 2 additions & 0 deletions mars-cli/generate_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,10 @@ def create_settings_file(settings_dir):
config["ena"] = {
"development-url": "https://wwwdev.ebi.ac.uk/ena/submit/webin-v2/",
"development-submission-url": "https://wwwdev.ebi.ac.uk/ena/submit/drop-box/submit/?auth=ENA",
"development-data-submission-url": "webin2.ebi.ac.uk",
"production-url": "https://www.ebi.ac.uk/ena/submit/webin-v2/",
"production-submission-url": "https://www.ebi.ac.uk/ena/submit/drop-box/submit/?auth=ENA",
"production-data-submission-url": "webin2.ebi.ac.uk",
}

config["biosamples"] = {
Expand Down
34 changes: 34 additions & 0 deletions mars-cli/mars_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,11 @@
"development-submission-url",
fallback="https://wwwdev.ebi.ac.uk/biosamples/samples/submit",
),
"DATA-SUBMISSION": config.get(
"ena",
"development-data-submission-url",
fallback="webin2.ebi.ac.uk",
),
},
"WEBIN": {
"SERVICE": config.get(
Expand Down Expand Up @@ -101,6 +106,11 @@
"production-submission-url",
fallback="https://www.ebi.ac.uk/ena/submit/drop-box/submit/?auth=ENA",
),
"DATA-SUBMISSION": config.get(
"ena",
"development-data-submission-url",
fallback="webin2.ebi.ac.uk",
),
},
"WEBIN": {
"SERVICE": config.get(
Expand Down Expand Up @@ -167,6 +177,23 @@ def cli(ctx, development):
)
@click.argument("isa_json_file", type=click.File("r"))
@click.option("--submit-to-ena", type=click.BOOL, default=True, help="Submit to ENA.")
@click.option(
"--file-transfer",
type=click.STRING,
help="provide the name of a file transfer solution, like ftp or aspera",
)
@click.option(
"--data-files",
type=click.File("r"),
multiple=True,
help="Path of files to upload",
)
# @click.option(
# "--data-submit-to-ena",
# type=click.BOOL,
# default=False,
# help="Submit data files to ENA.",
# )
@click.option(
"--submit-to-metabolights",
type=click.BOOL,
Expand All @@ -189,6 +216,8 @@ def submit(
submit_to_ena,
submit_to_metabolights,
investigation_is_root,
file_transfer,
data_files,
):
"""Start a submission to the target repositories."""
target_repositories = [TargetRepository.BIOSAMPLES]
Expand All @@ -209,6 +238,9 @@ def submit(
)

urls_dict = ctx.obj["FILTERED_URLS"]

data_file_paths = [f.name for f in data_files] if file_transfer else []

try:
submission(
credential_service_name,
Expand All @@ -218,6 +250,8 @@ def submit(
target_repositories,
investigation_is_root,
urls_dict,
file_transfer,
data_file_paths,
)
except requests.RequestException as err:
tb = sys.exc_info()[2] # Traceback value
Expand Down
59 changes: 59 additions & 0 deletions mars-cli/mars_lib/ftp_upload.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import ftplib
import os
from pathlib import Path
from typing import List

from retry import retry
from mars_lib.logging import print_and_log


class PatchFTP_TLS(ftplib.FTP_TLS):
    """
    Explicit FTPS client whose data connections share the control
    connection's TLS session.

    Adapted from https://stackoverflow.com/questions/14659154/ftpes-session-reuse-required
    to work around a bug in the Python standard library:
    https://bugs.python.org/issue19500
    """

    def ntransfercmd(self, cmd, rest=None):
        """Open the data connection for ``cmd`` and return ``(socket, size)``.

        The plain ``ftplib.FTP`` implementation is called directly so that the
        TLS wrapping is done here, with the control socket's session attached.
        """
        data_sock, expected_size = ftplib.FTP.ntransfercmd(self, cmd, rest)
        if not self._prot_p:
            # Data channel protection is off: hand back the raw socket.
            return data_sock, expected_size

        # This is the fix: pass the control connection's TLS session along.
        secured_sock = self.context.wrap_socket(
            data_sock,
            server_hostname=self.host,
            session=self.sock.session,
        )
        return secured_sock, expected_size


class FTPUploader:
    """Upload local files to an FTP server over explicit FTPS.

    Connections are opened with ``PatchFTP_TLS`` and switched to a protected
    data channel (``prot_p``) before any listing or transfer takes place.
    """

    def __init__(self, ftp_host: str, username: str, password: str):
        """Store the connection details; no connection is opened here.

        Args:
            ftp_host: Host name of the FTP server.
            username: Login name used for the FTP session.
            password: Password used for the FTP session.
        """
        self.ftp_host = ftp_host
        self.username = username
        self.password = password

    # Any error in ftplib.all_errors triggers a retry of the whole method
    # (3 tries). Files that already made it across with the right size are
    # skipped by the size check below.
    @retry(exceptions=ftplib.all_errors, tries=3, delay=2, backoff=1.2, jitter=(1, 3))
    def upload(self, file_paths: List[Path], target_location: str = "/") -> bool:
        """Upload ``file_paths`` into ``target_location`` on the server.

        A file is skipped when a remote entry with the same base name is
        already listed and its reported size equals the local size.

        Args:
            file_paths: Local files to upload (anything ``os.path.getsize``
                and ``open`` accept). May be empty.
            target_location: Remote directory to change into before uploading.

        Returns:
            True once every file was uploaded or skipped, including when
            there was nothing to upload.

        Raises:
            One of ``ftplib.all_errors`` (e.g. ``ftplib.error_perm`` on a
            rejected login) if the attempts are exhausted.
        """
        if not file_paths:
            # Nothing to transfer; max() below would raise ValueError on an
            # empty sequence, so return before touching the network.
            return True

        local_sizes = [os.path.getsize(path) for path in file_paths]

        # Heuristic for the socket timeout: assume roughly 10 MB/s
        # (10,000,000 bytes per second) for the largest file, but never less
        # than 30 seconds and never more than an hour.
        timeout = min(max(max(local_sizes) // 10_000_000, 30), 3600)

        with PatchFTP_TLS() as ftps:
            ftps.context.set_ciphers("HIGH:!DH:!aNULL")
            ftps.connect(self.ftp_host, port=21, timeout=timeout)
            ftps.login(self.username, self.password)
            ftps.prot_p()

            ftps.cwd(target_location)
            previous_content = ftps.nlst()
            for file_to_upload, local_size in zip(file_paths, local_sizes):
                file_name = os.path.basename(file_to_upload)
                if (
                    file_name in previous_content
                    and ftps.size(file_name) == local_size
                ):
                    print_and_log(
                        f"{file_name} already exists and has the same size on the FTP, skipping"
                    )
                    continue
                print_and_log(f"Uploading {file_name} to FTP")
                with open(file_to_upload, "rb") as open_file:
                    ftps.storbinary(f"STOR {file_name}", open_file)

        return True
40 changes: 39 additions & 1 deletion mars-cli/mars_lib/submit.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@
from mars_lib.logging import print_and_log
from pydantic import ValidationError

from mars_lib.ftp_upload import FTPUploader
from pathlib import Path
from typing import List


def submission(
credential_service_name: str,
Expand All @@ -26,6 +30,8 @@ def submission(
target_repositories: list[str],
investigation_is_root: bool,
urls: dict[str, Any],
file_transfer: str,
data_file_paths=None,
):
# If credential manager info found:
# Get password from the credential manager
Expand Down Expand Up @@ -53,7 +59,18 @@ def submission(
f"ISA JSON with investigation '{isa_json.investigation.title}' is valid."
)

if TargetRepository.ENA in target_repositories:
if (
TargetRepository.ENA in target_repositories
and data_file_paths
and file_transfer
):
upload_to_ena(
file_paths=data_file_paths,
user_credentials=user_credentials,
submission_url=urls["ENA"]["DATA-SUBMISSION"],
file_transfer=file_transfer,
)
elif TargetRepository.ENA in target_repositories:
# TODO: Filter out other assays
ena_result = submit_to_ena(
isa_json=isa_json,
Expand All @@ -64,6 +81,7 @@ def submission(
f"Submission to {TargetRepository.ENA} was successful. Result:\n{ena_result.json()}"
)
# TODO: Update `isa_json`, based on the receipt returned

elif TargetRepository.BIOSAMPLES in target_repositories:
# Submit to Biosamples
biosamples_result = submit_to_biosamples(
Expand Down Expand Up @@ -158,6 +176,26 @@ def submit_to_ena(
return result


def upload_to_ena(
    file_paths: List[Path],
    user_credentials: dict[str, str],
    submission_url: str,
    file_transfer: str,
):
    """Upload data files to ENA with the requested file transfer solution.

    Args:
        file_paths: Local files to upload.
        user_credentials: Mapping with the ``"username"`` and ``"password"``
            used to log in to the upload server.
        submission_url: Host of the upload server; passed to ``FTPUploader``
            as the FTP host.
        file_transfer: Name of the transfer solution, case-insensitive.
            Recognised values are ``"ftp"`` and ``"aspera"``.

    Raises:
        ValueError: If ``file_transfer`` is not a recognised solution.
        NotImplementedError: If ``file_transfer`` is ``"aspera"``, which is
            recognised but has no implementation yet.
    """
    ALLOWED_FILE_TRANSFER_SOLUTIONS = {"ftp", "aspera"}
    file_transfer = file_transfer.lower()

    if file_transfer not in ALLOWED_FILE_TRANSFER_SOLUTIONS:
        raise ValueError(f"Unsupported transfer protocol: {file_transfer}")
    if file_transfer == "aspera":
        # Fail loudly instead of returning as if the files had been uploaded.
        raise NotImplementedError(
            "File transfer with 'aspera' is not implemented yet; use 'ftp'."
        )

    uploader = FTPUploader(
        submission_url,
        user_credentials["username"],
        user_credentials["password"],
    )
    uploader.upload(file_paths)


def create_external_references(
biosamples_credentials: dict[str, str],
biosamples_externalReferences: dict[str, Any],
Expand Down
1 change: 1 addition & 0 deletions mars-cli/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ jsonschema
keyring
pydantic
click
retry
22 changes: 22 additions & 0 deletions mars-cli/tests/test_ftp_upload.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import json

import pytest
from pathlib import Path
import ftplib

from mars_lib.ftp_upload import FTPUploader


def test_upload_login_failure():
    """Junk credentials must surface the server's 530 login rejection."""
    bogus_uploader = FTPUploader("webin2.ebi.ac.uk", "junk", "more junk")
    fixture = Path("./tests/fixtures/not_a_json_file.txt")

    with pytest.raises(ftplib.error_perm, match="530 Login incorrect."):
        bogus_uploader.upload([fixture])


@pytest.mark.skip(reason="Relies on real ENA credentials in test_credentials_example.json")
def test_upload_success():
    """Upload two local files using the credentials stored in the example file."""
    # For local testing, add ENA username/password to test_credentials_example.json
    credentials_file = Path("./tests/test_credentials_example.json")
    creds = json.loads(credentials_file.read_text())

    files_to_send = [
        Path("../test-data/ENA_TEST2.R1.fastq.gz"),
        Path("./tests/fixtures/not_a_json_file.txt"),
    ]
    uploader = FTPUploader("webin2.ebi.ac.uk", creds["username"], creds["password"])
    uploader.upload(files_to_send)

0 comments on commit 8d7dcde

Please sign in to comment.