Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: load-qebil-downloaded-studies #3112

Open
wants to merge 5 commits into
base: dev
Choose a base branch
from
Open
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
211 changes: 211 additions & 0 deletions scripts/qiita-load-qebil-downloads
Original file line number Diff line number Diff line change
@@ -0,0 +1,211 @@
#!/usr/bin/env python
# -----------------------------------------------------------------------------
# Copyright (c) 2014--, The Qiita Development Team.
#
# Distributed under the terms of the BSD 3-clause License.
#
# The full license is in the file LICENSE, distributed with this software.
# -----------------------------------------------------------------------------
"""Load QEBIL-downloaded EBI/ENA studies into Qiita.

Scans every per-study folder under ``EBIDIR``; for each folder whose
``qebil_status`` file reports 'complete' and whose title is not already a
Qiita study, it creates the study, loads the sample template, loads one prep
template per data type, and creates a per_sample_FASTQ artifact per prep.
Files that could not be associated are copied to the study's uploads area,
and any warnings/notes are attached to the study as HTML notes.
"""

from time import sleep
from glob import glob
from os.path import isdir, basename, join
from shutil import copyfile

from qiita_db.study import Study
from qiita_db.artifact import Artifact
from qiita_db.commands import (
    load_study_from_cmd, load_sample_template_from_cmd,
    load_prep_template_from_cmd)
from qiita_db.util import get_data_types, get_mountpoint


# Seconds to pause before processing each folder, giving the operator a
# chance to ctrl-c out of an unintended load.
SLEEP_TIME = 10
# Root folder where QEBIL deposits one sub-folder per downloaded study.
EBIDIR = '/panfs/panfs1.ucsd.edu/panscratch/qiita/qebil/vertebrates/'

# Qiita data types, space->underscore so they can be matched against
# prep-info filenames (e.g. "..._prep_info_16S_...").
data_types = {x.replace(' ', '_') for x in get_data_types()}

for folder in glob(f'{EBIDIR}/*'):
    warnings = []
    extra_notes = dict()
    if not isdir(folder):
        print(f'Ignoring: {folder}')
        continue
    # not necessary but nice for debugging
    print(f'\n\n\n+++> Processing {folder}, you have {SLEEP_TIME} '
          'seconds to ctrl-c')
    # Use the constant so the printed countdown matches the actual wait
    # (previously hard-coded to 10).
    sleep(SLEEP_TIME)

    files = glob(f'{folder}/*')
    files_used = []
    # NOTE(review): assumes exactly one qebil_status file per folder;
    # an empty match raises IndexError -- confirm QEBIL guarantees this.
    qebil_status_fp = [f for f in files if f.endswith('qebil_status')][0]
    with open(qebil_status_fp, 'r') as fp:
        qebil_status = fp.readlines()[0]
    if 'complete' not in qebil_status:
        print(f'Skipping {qebil_status_fp}, not ready: {qebil_status}')
        continue
    files_used.append(qebil_status_fp)

    # The three required metadata files; same single-match assumption as
    # qebil_status above.
    title_fp = [f for f in files if f.endswith('_study_title.txt')][0]
    files_used.append(title_fp)
    config_fp = [f for f in files if f.endswith('_study_config.txt')][0]
    files_used.append(config_fp)
    sample_fp = [f for f in files if f.endswith('_sample_info.tsv')][0]
    files_used.append(sample_fp)

    with open(title_fp, 'r') as fp:
        title = fp.readlines()[0]

    # Title is the only duplicate-load guard: skip studies already in Qiita.
    if Study.exists(title):
        print(f'======> {folder}: {title} already loaded')
        continue

    with open(config_fp, 'r') as fp:
        study = load_study_from_cmd('[email protected]', title, fp)

    study.autoloaded = True
    # study_alias may hold several ';'-separated accessions; keep the first.
    study.ebi_study_accession = study.info['study_alias'].split(';')[0]
    st = study.sample_template
    # Load for its side effect of populating the sample template; the
    # return value is not needed.
    load_sample_template_from_cmd(sample_fp, study.id)
    st.ebi_sample_accessions = st.get_category('secondary_sample_accession')
    st.biosample_accessions = st.get_category('sample_accession')

    # Group prep-info files by data type; anything that is neither a known
    # auxiliary file nor a matchable prep becomes a warning.
    preps = dict()
    for f in files:
        if '_prep_info_' not in f:
            # Auxiliary QEBIL outputs are accounted for but not loaded.
            if (f.endswith('.log') or f.endswith('.EBI_metadata.tsv') or
                    f.endswith('.QIIME_mapping_file.tsv')):
                files_used.append(f)
            continue

        if 'MISSING' in f or 'TOOMANYREADS' in f:
            warnings.append(f'Skipping: {f}')
            # Add each explanatory note at most once per study.
            if 'MISSING' in f and 'MISSING' not in extra_notes:
                extra_notes['MISSING'] = (
                    'One or more of the fastq files for your study were '
                    'unavailable for download from EBI/ENA or the downloaded '
                    'files were found to contain corrupt data and were '
                    'excluded from our automatic association and processing. '
                    'A list of the affected samples and their corresponding '
                    'EBI/ENA ftp links can be found in the .MISSING '
                    'preparation information files in the Uploads section of '
                    'this page. If you would like to attempt to manually '
                    'download and/or correct the fastq files, please visit '
                    'the linked EBI/ENA project page in the Study details and '
                    'follow our instructions for <a href="https://qiita.ucsd.'
                    'edu/static/doc/html/gettingstartedguide/index.html#'
                    'attaching-the-sample-information-to-the-study" '
                    'target="_blank">manually associating and processing the '
                    'files</a>.')
            elif 'TOOMANYREADS' not in extra_notes:
                extra_notes['TOOMANYREADS'] = (
                    'One or more of the fastq files for your study were found '
                    'to contain more read files than indicated by the single '
                    'or paired-end read technology that EBI/ENA indicated was '
                    'used for processing the sample. This is most likely the '
                    'case for studies where index reads have been included in '
                    'a separate file as part of the upload, however our '
                    'automated system is unable to readily distinguish this. '
                    'A list of the affected samples and their corresponding '
                    'EBI/ENA ftp links can be found in the .TOOMANYREADS '
                    'preparation information files in the Uploads section of '
                    'this page. If you would like to attempt to have these '
                    'samples processed, please visit the linked EBI/ENA '
                    'project page in the Study details and either a) follow '
                    'our instructions for <a href="https://qiita.ucsd.'
                    'edu/static/doc/html/gettingstartedguide/index.html#'
                    'attaching-the-sample-information-to-the-study" '
                    'target="_blank">manually associating and processing the '
                    'files</a>. or b) email Qiita Help to indicate that the '
                    'study should be processed with the assumption that the '
                    'first file associated with a samples is an index read '
                    'file.')
            continue
        added = False
        for dt in data_types:
            # Substring match of the underscored data type in the filename.
            if dt in f:
                preps.setdefault(dt, []).append(f)
                added = True
                files_used.append(f)
                break
        if not added:
            warnings.append(f'Not supported: {f}')

    if not preps:
        warnings.append('No valid preparations found')

    for dt, ptfps in preps.items():
        # Restore the space-separated form Qiita expects for data types.
        dt = dt.replace('_', ' ')
        print(f'==> Processing {dt}')
        for ptfp in ptfps:
            print(f'    {ptfp}')
            files_used.append(ptfp)
            pt = load_prep_template_from_cmd(ptfp, study.id, dt)
            pt.ebi_experiment_accessions = pt.get_category(
                'experiment_accession')
            pt.ebi_run_accessions = pt.get_category('run_accession')

            library_layout = set(pt.get_category('library_layout').values())

            run_prefixes = pt.get_category('run_prefix').values()

            # Duplicate run prefixes would make file->sample association
            # ambiguous, so skip the whole prep.
            if len(run_prefixes) != len(set(run_prefixes)):
                warnings.append(
                    f'Run prefixes are not unique; prep-id: {pt.id}')
                continue

            # Collect (filepath, filepath_type) pairs: 1 = forward read,
            # 2 = reverse read.
            filepaths = []
            for rp in run_prefixes:
                # NOTE(review): substring match may over-match if one run
                # prefix is a prefix of another filename; consider
                # basename(f).startswith(rp).
                matches = sorted([f for f in files if rp in f])
                if library_layout == {'PAIRED'}:
                    if len(matches) != 2:
                        warnings.append(f"{pt.id}: {rp} doesn't match PAIRED "
                                        "library layout")
                        continue
                    filepaths.append((matches[0], 1))
                    filepaths.append((matches[1], 2))
                elif library_layout == {'SINGLE'}:
                    if len(matches) != 1:
                        warnings.append(f"{pt.id}: {rp} doesn't match SINGLE "
                                        "library layout")
                        continue
                    filepaths.append((matches[0], 1))
                else:
                    # Mixed or unexpected layouts are not supported.
                    warnings.append('Unknown library layout: '
                                    f'{library_layout}; prep-id: {pt.id}')
            files_used.extend([x for x, _ in filepaths])

            # Final sanity check: every run prefix must have contributed the
            # expected number of files before creating the artifact.
            lfp = len(filepaths)
            lrp = len(run_prefixes)
            if library_layout == {'PAIRED'} and lfp != lrp*2:
                warnings.append('Not a valid number of files/run_prefixes '
                                f'({lfp}/{lrp}) for "PAIRED"; prep-id: '
                                f'{pt.id}')
                continue
            elif library_layout == {'SINGLE'} and lfp != lrp:
                warnings.append('Not a valid number of files/run_prefixes '
                                f'({lfp}/{lrp}) for "SINGLE"; prep-id: '
                                f'{pt.id}')
                continue

            # Created for its side effects in the database; the returned
            # artifact object is not needed here.
            Artifact.create(filepaths, 'per_sample_FASTQ',
                            prep_template=pt, move_files=False)

    # Build an HTML notes blob summarizing anything that needs the study
    # owner's attention.
    notes = ''
    if warnings:
        notes = '<b>Warnings</b>:<ol>%s</ol>\n' % ''.join(
            [f'<li>{x}</li>' for x in warnings])
    # Any downloaded file we did not load gets copied to the study's
    # uploads area so the owner can handle it manually.
    missing_files = list(set(files) - set(files_used))
    if missing_files:
        uploads_fp = join(get_mountpoint("uploads")[0][1], str(study.id))
        notes = f'{notes}<b>Extra files:</b><ul>'
        for mf in missing_files:
            # copyfile requires a file destination, not a directory, so
            # join the uploads folder with the file's basename.
            copyfile(mf, join(uploads_fp, basename(mf)))
            notes = f'{notes}<li>%s</li>' % basename(mf)
        notes = f'{notes}</ul>'
    if extra_notes:
        notes = f'{notes}<b>Extra Notes:</b><ul>%s</ul>' % ''.join(
            [f'<li>{x}</li>' for x in extra_notes.values()])

    if notes:
        study.notes = notes