Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/main' into process-isa-json-afte…
Browse files Browse the repository at this point in the history
…r-biosamples
  • Loading branch information
kdp-cloud committed Nov 6, 2024
2 parents 6a91ee0 + 4135a56 commit 94e7279
Show file tree
Hide file tree
Showing 8 changed files with 5,531 additions and 8 deletions.
2 changes: 2 additions & 0 deletions mars-cli/generate_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,10 @@ def create_settings_file(settings_dir):
config["ena"] = {
"development-url": "https://wwwdev.ebi.ac.uk/ena/submit/webin-v2/",
"development-submission-url": "https://wwwdev.ebi.ac.uk/ena/submit/drop-box/submit/?auth=ENA",
"development-data-submission-url": "webin2.ebi.ac.uk",
"production-url": "https://www.ebi.ac.uk/ena/submit/webin-v2/",
"production-submission-url": "https://www.ebi.ac.uk/ena/submit/drop-box/submit/?auth=ENA",
"production-data-submission-url": "webin2.ebi.ac.uk",
}

config["biosamples"] = {
Expand Down
34 changes: 34 additions & 0 deletions mars-cli/mars_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,11 @@
"development-submission-url",
fallback="https://wwwdev.ebi.ac.uk/biosamples/samples/submit",
),
"DATA-SUBMISSION": config.get(
"ena",
"development-data-submission-url",
fallback="webin2.ebi.ac.uk",
),
},
"WEBIN": {
"SERVICE": config.get(
Expand Down Expand Up @@ -101,6 +106,11 @@
"production-submission-url",
fallback="https://www.ebi.ac.uk/ena/submit/drop-box/submit/?auth=ENA",
),
"DATA-SUBMISSION": config.get(
"ena",
"development-data-submission-url",
fallback="webin2.ebi.ac.uk",
),
},
"WEBIN": {
"SERVICE": config.get(
Expand Down Expand Up @@ -173,6 +183,23 @@ def cli(ctx, development):
help="Submit to BioSamples.",
)
@click.option("--submit-to-ena", type=click.BOOL, default=True, help="Submit to ENA.")
@click.option(
"--file-transfer",
type=click.STRING,
help="provide the name of a file transfer solution, like ftp or aspera",
)
@click.option(
"--data-files",
type=click.File("r"),
multiple=True,
help="Path of files to upload",
)
# @click.option(
# "--data-submit-to-ena",
# type=click.BOOL,
# default=False,
# help="Submit data files to ENA.",
# )
@click.option(
"--submit-to-metabolights",
type=click.BOOL,
Expand All @@ -196,6 +223,8 @@ def submit(
submit_to_ena,
submit_to_metabolights,
investigation_is_root,
file_transfer,
data_files,
):
"""Start a submission to the target repositories."""
target_repositories = []
Expand All @@ -214,6 +243,9 @@ def submit(
)

urls_dict = ctx.obj["FILTERED_URLS"]

data_file_paths = [f.name for f in data_files] if file_transfer else []

try:
submission(
credential_service_name,
Expand All @@ -223,6 +255,8 @@ def submit(
target_repositories,
investigation_is_root,
urls_dict,
file_transfer,
data_file_paths,
)
except requests.RequestException as err:
tb = sys.exc_info()[2] # Traceback value
Expand Down
59 changes: 59 additions & 0 deletions mars-cli/mars_lib/ftp_upload.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import ftplib
import os
from pathlib import Path
from typing import List

from retry import retry
from mars_lib.logging import print_and_log


class PatchFTP_TLS(ftplib.FTP_TLS):
    """
    Explicit FTPS client whose data connections share the control TLS session.

    Adapted from https://stackoverflow.com/questions/14659154/ftpes-session-reuse-required
    as a workaround for a bug in the Python standard library:
    https://bugs.python.org/issue19500
    """

    def ntransfercmd(self, cmd, rest=None):
        """Open the data connection for ``cmd`` and return ``(socket, size)``.

        The plain ``ftplib.FTP`` implementation is used to open the data
        socket. When the data channel is protected (``self._prot_p``), that
        socket is wrapped in TLS using the session of the control socket.
        """
        data_sock, expected_size = ftplib.FTP.ntransfercmd(self, cmd, rest)
        if not self._prot_p:
            return data_sock, expected_size

        # This is the fix: pass the control connection's session so the
        # data connection reuses it instead of negotiating a new one.
        secured_sock = self.context.wrap_socket(
            data_sock,
            server_hostname=self.host,
            session=self.sock.session,
        )
        return secured_sock, expected_size


class FTPUploader:
    """Upload local files to an FTP host over explicit FTPS."""

    def __init__(self, ftp_host: str, username: str, password: str):
        """Store the connection details; no connection is opened here."""
        self.ftp_host = ftp_host
        self.username = username
        self.password = password

    @retry(exceptions=ftplib.all_errors, tries=3, delay=2, backoff=1.2, jitter=(1, 3))
    def upload(self, file_paths: List[Path], target_location: str = "/") -> bool:
        """Upload ``file_paths`` into ``target_location`` on the FTP host.

        A file is skipped when a file with the same base name and the same
        size is already present in ``target_location``.

        Args:
            file_paths: Local files to upload.
            target_location: Remote directory to change into before uploading.

        Returns:
            True once every file has been uploaded or skipped. An empty
            ``file_paths`` returns True without connecting.

        Raises:
            Any of ``ftplib.all_errors`` (which includes ``OSError``) if the
            attempts made by the ``retry`` decorator all fail.
        """
        if not file_paths:
            # Nothing to send; also avoids max() failing on an empty sequence.
            return True

        # Read each local size once; it is needed for both the timeout
        # heuristic and the "already uploaded" comparison below.
        local_sizes = {path: os.path.getsize(path) for path in file_paths}

        # Heuristic to set the expected timeout assuming 10 MB/s
        # (10,000,000 bytes per second) upload speed, but no less than
        # 30 seconds and no more than an hour.
        largest_size = max(local_sizes.values())
        timeout = min(max(int(largest_size / 10000000), 30), 3600)

        with PatchFTP_TLS() as ftps:
            ftps.context.set_ciphers("HIGH:!DH:!aNULL")
            ftps.connect(self.ftp_host, port=21, timeout=timeout)
            ftps.login(self.username, self.password)
            ftps.prot_p()

            ftps.cwd(target_location)
            previous_content = set(ftps.nlst())
            # nlst() switches the session to ASCII mode, and many servers
            # refuse SIZE in ASCII mode. Switch to binary before asking.
            ftps.voidcmd("TYPE I")
            for file_to_upload in file_paths:
                file_name = os.path.basename(file_to_upload)
                if (
                    file_name in previous_content
                    and ftps.size(file_name) == local_sizes[file_to_upload]
                ):
                    print_and_log(
                        f"{file_name} already exists and has the same size on the FTP, skipping"
                    )
                    continue
                print_and_log(f"Uploading {file_name} to FTP")
                with open(file_to_upload, "rb") as open_file:
                    ftps.storbinary(f"STOR {file_name}", open_file)

        return True
38 changes: 37 additions & 1 deletion mars-cli/mars_lib/submit.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@
from mars_lib.logging import print_and_log
from pydantic import ValidationError

from mars_lib.ftp_upload import FTPUploader
from pathlib import Path
from typing import List


def save_step_to_file(time_stamp: float, filename: str, isa_json: IsaJson):
dir_path = f"tmp/{str(time_stamp)}"
Expand All @@ -44,6 +48,8 @@ def submission(
target_repositories: list[str],
investigation_is_root: bool,
urls: dict[str, Any],
file_transfer: str,
data_file_paths=None,
):
# If credential manager info found:
# Get password from the credential manager
Expand Down Expand Up @@ -82,7 +88,17 @@ def submission(
):
raise ValueError("No target repository selected.")

if TargetRepository.BIOSAMPLES in target_repositories:
if (
TargetRepository.ENA in target_repositories
and data_file_paths
and file_transfer
):
upload_to_ena(
file_paths=data_file_paths,
user_credentials=user_credentials,
submission_url=urls["ENA"]["DATA-SUBMISSION"],
file_transfer=file_transfer,
)
# Submit to Biosamples
biosamples_result = submit_to_biosamples(
isa_json=isa_json,
Expand Down Expand Up @@ -202,6 +218,26 @@ def submit_to_ena(
return result


def upload_to_ena(
    file_paths: List[Path],
    user_credentials: dict[str, str],
    submission_url: str,
    file_transfer: str,
):
    """Upload data files to ENA with the requested file transfer solution.

    Args:
        file_paths: Local files to upload.
        user_credentials: Mapping providing the "username" and "password" keys.
        submission_url: Host handed to the uploader as the FTP host.
        file_transfer: Name of the transfer solution, case-insensitive.

    Raises:
        ValueError: If ``file_transfer`` is not a recognised solution.
        NotImplementedError: If the solution is recognised but has no
            uploader yet (currently "aspera").
    """
    ALLOWED_FILE_TRANSFER_SOLUTIONS = {"ftp", "aspera"}
    file_transfer = file_transfer.lower()

    if file_transfer not in ALLOWED_FILE_TRANSFER_SOLUTIONS:
        raise ValueError(f"Unsupported transfer protocol: {file_transfer}")

    if file_transfer == "ftp":
        uploader = FTPUploader(
            submission_url,
            user_credentials["username"],
            user_credentials["password"],
        )
        uploader.upload(file_paths)
        return

    # A recognised solution without an uploader must not fall through
    # silently: the caller would otherwise assume the files were transferred.
    raise NotImplementedError(
        f"File transfer solution '{file_transfer}' is not implemented yet; "
        "no files were uploaded."
    )


def create_external_references(
biosamples_credentials: dict[str, str],
biosamples_externalReferences: dict[str, Any],
Expand Down
1 change: 1 addition & 0 deletions mars-cli/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ jsonschema
keyring
pydantic
click
retry
22 changes: 22 additions & 0 deletions mars-cli/tests/test_ftp_upload.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import json

import pytest
from pathlib import Path
import ftplib

from mars_lib.ftp_upload import FTPUploader


def test_upload_login_failure():
    """Uploading with junk credentials must raise the 530 login error."""
    bogus_uploader = FTPUploader("webin2.ebi.ac.uk", "junk", "more junk")
    fixture_file = Path("./tests/fixtures/not_a_json_file.txt")
    with pytest.raises(ftplib.error_perm, match="530 Login incorrect."):
        bogus_uploader.upload([fixture_file])


@pytest.mark.skip(reason="Relies on real ENA credentials in test_credentials_example.json")
def test_upload_success():
    """Upload two files with the credentials stored in the example JSON file."""
    # For local testing, add ENA username/password to test_credentials_example.json
    with open("./tests/test_credentials_example.json") as credentials_file:
        credentials = json.load(credentials_file)

    files_to_send = [
        Path("../test-data/ENA_TEST2.R1.fastq.gz"),
        Path("./tests/fixtures/not_a_json_file.txt"),
    ]
    ena_uploader = FTPUploader(
        "webin2.ebi.ac.uk",
        credentials["username"],
        credentials["password"],
    )
    ena_uploader.upload(files_to_send)
59 changes: 52 additions & 7 deletions repository-services/repository-api.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
This document is to define the interface between the broker and the target repository services.
This applies to all repositories, including BioSamples.

At present, only a single API endpoint is required, `submit`. Authentication and data transfer are not covered in this document, but some assumptions are laid out below.
There is one required endpoint, `submit`, as well as a submission status endpoint recommended for long-running submission processing. Authentication and data transfer are not covered in this document, but some assumptions are laid out below.

## Authentication
If the repository requires authentication to submit data, the submit endpoint must allow authentication via an authorization header.
Expand Down Expand Up @@ -35,24 +35,33 @@ The response must be JSON in the following format:
"errors": [
// error objects
],
"status": {
// status object
},
"info": [
// info objects
]
}
```
where:
* `targetRepository` is the identifier used to annotate the ISA-JSON and should take values from [identifiers.org](http://identifiers.org/)
* Either [`accessions`](#accession-object) OR [`errors`](#error-object), but not both, must be present as a list of objects of the form described below. Presence of this field indicates whether the submission was a success or a failure.
* (optional) [`info`](#info-object) is a list of objects of the form described below. This allows additional repository-specific information to be returned in the response.
* Exactly one of the following:
* `accessions`: list of objects defined [here](#accession-object)
* `errors`: list of objects defined [here](#error-object)
* `status`: object defined [here](#status-object)
* Presence of `accessions`, `errors`, or `status` indicates whether the submission was a success, was a failure, or is still pending (asynchronous response), respectively.
* (optional) `info` is a list of objects of the form described [below](#info-object). This allows additional repository-specific information to be returned in the response.

This object is frequently referred to as the "receipt" or the "MARS receipt".

#### Accession object
The accession object looks like the following:
```jsonc
{
"path": [
{"key": "studies", "where": {"key": "X", "value": "Y"}},
{"key": "materials"}
// further path objects as needed
{"key": "studies", "where": {"key": "X", "value": "Y"}},
{"key": "materials"}
// further path objects as needed
],
"value": "REPO_123"
}
Expand Down Expand Up @@ -88,6 +97,20 @@ The error objects being returned by the repository may be used by developers to

Besides this error reporting, the service should employ other HTTP error codes as usual (e.g. 401).

#### Status object
The status object looks like the following:
```jsonc
{
"statusUrl": "...",
"id": "...",
"percentComplete": 0.25,
}
```
where:
* `statusUrl` is a URL that can be queried to determine the completion status of the submission (see [status endpoint](#submission-status-endpoint) section below)
* (optional) `id` is an identifier for the submission
* (optional) `percentComplete` is a number between 0 and 1 indicating the approximate fraction of the repository's processing that is complete (e.g. `0.25` means 25% complete)

#### Info object
The info object looks like the following:
```jsonc
Expand All @@ -96,10 +119,21 @@ The info object looks like the following:
"message": "..."
}
```
where `name` and `message` are strings at the repository’s discretion.
where `name` (optional) and `message` are strings at the repository’s discretion.

This can be used to provide any additional information back to the user, not relating to accessions or errors. For example, it could include the submission date and when the data will be made public. This will not be processed further by the broker but will only be presented to the user.

## Submission status endpoint
`GET /{submission_id}/status`

(The endpoint path is only a suggestion; the actual path can differ as long as it is accurately returned in the `statusUrl` field of the receipt's `status` object.)

This endpoint is used to poll for the status of a previous submission. It should be used whenever the time from data and metadata submission until the issuing of accessions exceeds a reasonable duration, in which case the URL of this endpoint must be returned in the `status` field of the receipt.

### Response

The response must be in the same format as for the submit endpoint (i.e. the [MARS receipt](#response)), again indicating whether the submission is complete and successful, complete with errors, or still pending.

## Examples

### Submission request
Expand Down Expand Up @@ -219,3 +253,14 @@ For illustration only.
]
}
```

### Status response
```json
{
"targetRepository": "eva",
"status": {
"id": "123-456",
"statusUrl": "https://ebi.ac.uk/eva/submission/123-456/status"
}
}
```
Loading

0 comments on commit 94e7279

Please sign in to comment.