-
Notifications
You must be signed in to change notification settings - Fork 80
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
WIP: load-qebil-downloaded-studies #3112
base: dev
Are you sure you want to change the base?
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,211 @@ | ||
#!/usr/bin/env python | ||
# ----------------------------------------------------------------------------- | ||
# Copyright (c) 2014--, The Qiita Development Team. | ||
# | ||
# Distributed under the terms of the BSD 3-clause License. | ||
# | ||
# The full license is in the file LICENSE, distributed with this software. | ||
# ----------------------------------------------------------------------------- | ||
|
||
from time import sleep | ||
from glob import glob | ||
from os.path import isdir, basename, join | ||
from shutil import copyfile | ||
|
||
from qiita_db.study import Study | ||
from qiita_db.artifact import Artifact | ||
from qiita_db.commands import ( | ||
load_study_from_cmd, load_sample_template_from_cmd, | ||
load_prep_template_from_cmd) | ||
from qiita_db.util import get_data_types, get_mountpoint | ||
|
||
|
||
# Seconds to pause before each folder so the operator has time to ctrl-c.
SLEEP_TIME = 10
# Root directory holding one sub-folder per downloaded study.
EBIDIR = '/panfs/panfs1.ucsd.edu/panscratch/qiita/qebil/vertebrates/'
# Data type names with spaces replaced by underscores, so they can be looked
# for inside prep-info file names further down.
data_types = {x.replace(' ', '_') for x in get_data_types()}

for folder in glob(f'{EBIDIR}/*'):
    # Per-folder accumulators: free-form warnings and one-off explanatory
    # notes, both of which end up in the study notes.
    warnings = []
    extra_notes = dict()
    if not isdir(folder):
        print(f'Ignoring: {folder}')
        continue
    # not necessary but nice for debugging
    print(f'\n\n\n+++> Processing {folder}, you have {SLEEP_TIME} '
          'seconds to ctrl-c')
    # Sleep for the advertised time rather than a hard-coded literal, so the
    # message above and the actual pause cannot drift apart.
    sleep(SLEEP_TIME)
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What's the sleep for? |
||
|
||
files = glob(f'{folder}/*') | ||
files_used = [] | ||
qebil_status_fp = [f for f in files if f.endswith('qebil_status')][0] | ||
with open(qebil_status_fp, 'r') as fp: | ||
qebil_status = fp.readlines()[0] | ||
Comment on lines
+31
to
+33
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. How is qebil being executed? The file-based process communication may represent a pain point in the future. |
||
if 'complete' not in qebil_status: | ||
print(f'Skipping {qebil_status_fp}, not ready: {qebil_status}') | ||
continue | ||
files_used.append(qebil_status_fp) | ||
|
||
title_fp = [f for f in files if f.endswith('_study_title.txt')][0] | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Would it be erroneous if there were zero |
||
files_used.append(title_fp) | ||
config_fp = [f for f in files if f.endswith('_study_config.txt')][0] | ||
files_used.append(config_fp) | ||
sample_fp = [f for f in files if f.endswith('_sample_info.tsv')][0] | ||
files_used.append(sample_fp) | ||
|
||
with open(title_fp, 'r') as fp: | ||
title = fp.readlines()[0] | ||
|
||
if Study.exists(title): | ||
print(f'======> {folder}: {title} already loaded') | ||
continue | ||
|
||
with open(config_fp, 'r') as fp: | ||
study = load_study_from_cmd('[email protected]', title, fp) | ||
|
||
study.autoloaded = True | ||
study.ebi_study_accession = study.info['study_alias'].split(';')[0] | ||
st = study.sample_template | ||
sample_info = load_sample_template_from_cmd(sample_fp, study.id) | ||
st.ebi_sample_accessions = st.get_category('secondary_sample_accession') | ||
st.biosample_accessions = st.get_category('sample_accession') | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Out of curiosity, would it be valuable to test if the accessions were observed already in Qiita in addition to the study title? Although, I'm not sure what action it would suggest if they were :/ |
||
|
||
# Maps each matched data type (underscore form) to its prep-info files.
preps = dict()
for f in files:
    if '_prep_info_' not in f:
        # Not a prep-info file: logs and the derived metadata tables count
        # as handled; every other file is left for the extra-files report.
        if f.endswith(('.log', '.EBI_metadata.tsv',
                       '.QIIME_mapping_file.tsv')):
            files_used.append(f)
        continue

    has_missing = 'MISSING' in f
    has_too_many = 'TOOMANYREADS' in f
    if has_missing or has_too_many:
        warnings.append(f'Skipping: {f}')
        # Each explanatory note is added at most once per study, and only
        # when a file of that kind is actually present. The two checks are
        # independent so that a second MISSING file can never trigger the
        # TOOMANYREADS note.
        if has_missing and 'MISSING' not in extra_notes:
            extra_notes['MISSING'] = (
                'One or more of the fastq files for your study were '
                'unavailable for download from EBI/ENA or the downloaded '
                'files were found to contain corrupt data and were '
                'excluded from our automatic association and processing. '
                'A list of the affected samples and their corresponding '
                'EBI/ENA ftp links can be found in the .MISSING '
                'preparation information files in the Uploads section of '
                'this page. If you would like to attempt to manually '
                'download and/or correct the fastq files, please visit '
                'the linked EBI/ENA project page in the Study details and '
                'follow our instructions for <a href="https://qiita.ucsd.'
                'edu/static/doc/html/gettingstartedguide/index.html#'
                'attaching-the-sample-information-to-the-study" '
                'target="_blank">manually associating and processing the '
                'files</a>.')
        if has_too_many and 'TOOMANYREADS' not in extra_notes:
            extra_notes['TOOMANYREADS'] = (
                'One or more of the fastq files for your study were found '
                'to contain more read files than indicated by the single '
                'or paired-end read technology that EBI/ENA indicated was '
                'used for processing the sample. This is most likely the '
                'case for studies where index reads have been included in '
                'a separate file as part of the upload, however our '
                'automated system is unable to readily distinguish this. '
                'A list of the affected samples and their corresponding '
                'EBI/ENA ftp links can be found in the .TOOMANYREADS '
                'preparation information files in the Uploads section of '
                'this page. If you would like to attempt to have these '
                'samples processed, please visit the linked EBI/ENA '
                'project page in the Study details and either a) follow '
                'our instructions for <a href="https://qiita.ucsd.'
                'edu/static/doc/html/gettingstartedguide/index.html#'
                'attaching-the-sample-information-to-the-study" '
                'target="_blank">manually associating and processing the '
                'files</a>. or b) email Qiita Help to indicate that the '
                'study should be processed with the assumption that the '
                'first file associated with a samples is an index read '
                'file.')
        continue

    # The first data type whose name occurs in the path claims the file.
    for dt in data_types:
        if dt in f:
            preps.setdefault(dt, []).append(f)
            files_used.append(f)
            break
    else:
        # No data type matched this prep-info file.
        warnings.append(f'Not supported: {f}')

if not preps:
    warnings.append('No valid preparations found')
|
||
# Load every prep-info file and attach its fastq files as an artifact.
for dt, ptfps in preps.items():
    # preps is keyed by the underscore form used in file names; the loader
    # is given the data type name with spaces.
    dt = dt.replace('_', ' ')
    print(f'==> Processing {dt}')
    for ptfp in ptfps:
        print(f' {ptfp}')
        files_used.append(ptfp)
        pt = load_prep_template_from_cmd(ptfp, study.id, dt)
        pt.ebi_experiment_accessions = pt.get_category(
            'experiment_accession')
        pt.ebi_run_accessions = pt.get_category('run_accession')

        library_layout = set(pt.get_category('library_layout').values())

        run_prefixes = pt.get_category('run_prefix').values()

        if len(run_prefixes) != len(set(run_prefixes)):
            warnings.append(
                f'Run prefixes are not unique; prep-id: {pt.id}')
            continue

        # The whole prep must share a single layout, which fixes how many
        # read files are expected for each run prefix.
        if library_layout == {'PAIRED'}:
            layout, reads_per_run = 'PAIRED', 2
        elif library_layout == {'SINGLE'}:
            layout, reads_per_run = 'SINGLE', 1
        else:
            # Warn once and skip the prep: there is nothing valid to attach,
            # so no artifact is created from an empty file list.
            warnings.append('Unknown library layout: '
                            f'{library_layout}; prep-id: {pt.id}')
            continue

        filepaths = []
        for rp in run_prefixes:
            # NOTE(review): a reviewer questioned this substring match (the
            # comment is truncated in this copy: "should this be"). It
            # matches the run prefix anywhere in the full path -- confirm
            # whether a stricter match is wanted.
            matches = sorted(f for f in files if rp in f)
            if len(matches) != reads_per_run:
                warnings.append(f"{pt.id}: {rp} doesn't match {layout} "
                                "library layout")
                continue
            # Files are numbered 1, 2 in sorted order; presumably these are
            # the forward/reverse filepath type ids -- TODO confirm.
            filepaths.extend(
                (fp, i) for i, fp in enumerate(matches, start=1))
        files_used.extend(x for x, _ in filepaths)

        lfp = len(filepaths)
        lrp = len(run_prefixes)
        if lfp != lrp * reads_per_run:
            # At least one run prefix was rejected above; do not create a
            # partial artifact.
            warnings.append('Not a valid number of files/run_prefixes '
                            f'({lfp}/{lrp}) for "{layout}"; prep-id: '
                            f'{pt.id}')
            continue

        artifact = Artifact.create(filepaths, 'per_sample_FASTQ',
                                   prep_template=pt, move_files=False)
# Assemble the HTML notes for the study from warnings, leftover files and
# the one-off explanatory notes.
# NOTE(review): a reviewer remarked that the HTML formatting feels like it
# belongs in the visualization layer at template render time, while
# accepting that doing it here is pragmatic.
notes_parts = []
if warnings:
    warning_items = ''.join(f'<li>{x}</li>' for x in warnings)
    notes_parts.append(f'<b>Warnings</b>:<ol>{warning_items}</ol>\n')

# Files in the folder that no earlier step consumed; sorted so the copy
# order and the generated notes are deterministic.
missing_files = sorted(set(files) - set(files_used))
if missing_files:
    # assumes the study's uploads directory already exists -- TODO confirm
    uploads_fp = join(get_mountpoint("uploads")[0][1], str(study.id))
    extra_items = []
    for mf in missing_files:
        # copyfile() needs the complete target file name, not a directory.
        copyfile(mf, join(uploads_fp, basename(mf)))
        extra_items.append(f'<li>{basename(mf)}</li>')
    notes_parts.append(
        '<b>Extra files:</b><ul>%s</ul>' % ''.join(extra_items))

if extra_notes:
    notes_parts.append('<b>Extra Notes:</b><ul>%s</ul>' % ''.join(
        f'<li>{x}</li>' for x in extra_notes.values()))

# Joining pre-formatted parts keeps any '%' inside a warning or file name
# from being interpreted as a format directive.
notes = ''.join(notes_parts)
if notes:
    study.notes = notes
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
vertebrates?