Draft: error handling in autoreporting tool #70

Closed
wants to merge 4 commits
12 changes: 10 additions & 2 deletions Scripts/annotate.py
@@ -6,6 +6,7 @@
import numpy as np
import tabix
from autoreporting_utils import *
import logging
#TODO: make a system for making sure we can calculate all necessary fields,
#e.g. by checking that the columns exist
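Note: the TODO above asks for an up-front check that all required fields can be computed. A minimal sketch of such a check, assuming the columns dictionary produced by columns_from_arguments; the helper name validate_columns is illustrative, not part of this PR:

import logging

def validate_columns(df, columns):
    # Hypothetical pre-flight check: confirm that every column named in the
    # columns dict exists in the dataframe before any computation starts.
    missing = [c for c in columns.values() if c not in df.columns]
    if missing:
        logging.getLogger(__name__).error("Missing required columns: {}".format(missing))
        raise KeyError("Missing required columns: {}".format(missing))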

@@ -197,13 +198,20 @@ def annotate(df,gnomad_genome_path, gnomad_exome_path, batch_freq, finngen_path,
parser.add_argument("--annotate-out",dest="annotate_out",type=str,default="annotate_out.tsv",help="Output filename, default is annotate_out.tsv")
parser.add_argument("--column-labels",dest="column_labels",metavar=("CHROM","POS","REF","ALT","PVAL","BETA","AF"),nargs=7,default=["#chrom","pos","ref","alt","pval","beta","maf"],help="Names for data file columns. Default is '#chrom pos ref alt pval beta maf'.")
parser.add_argument("--finngen-annotation-version",dest="fg_ann_version",type=str,default="r3",help="Finngen annotation release version: 'r3' for release 3 or earlier, 'r4' for release 4 or later. Default 'r3'.")
parser.add_argument("--loglevel",dest="loglevel",type=str,default="warning",help="Level at which events are logged. Use values info, debug, warning, error, critical" )

args=parser.parse_args()
columns=autoreporting_utils.columns_from_arguments(args.column_labels)
loglevel=getattr(logging, args.loglevel.upper() )
logging.basicConfig(level=loglevel)
log_arguments(args)

columns=columns_from_arguments(args.column_labels)
if args.prefix!="":
args.prefix=args.prefix+"."
args.annotate_out = "{}{}".format(args.prefix,args.annotate_out)
if (args.gnomad_exome_path == None) or (args.gnomad_genome_path == None) or (args.finngen_path==None):
print("Annotation files missing, aborting...")
logger=logging.getLogger(__name__)
logger.error("Annotation files missing, aborting...")
else:
input_df = pd.read_csv(args.annotate_fpath,sep="\t")
df = annotate(df=input_df,gnomad_genome_path=args.gnomad_genome_path, gnomad_exome_path=args.gnomad_exome_path, batch_freq=args.batch_freq, finngen_path=args.finngen_path,fg_ann_version=args.fg_ann_version,
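Note: the new --loglevel flag is resolved with getattr against the logging module, so e.g. warning maps to logging.WARNING (numeric value 30) before being passed to logging.basicConfig. One caveat: getattr without a default raises AttributeError on an unrecognized value. A sketch of the same pattern with explicit validation added (the validation is an editorial suggestion, not part of this diff):

import logging

loglevel_str = "warning"  # stands in for args.loglevel
loglevel = getattr(logging, loglevel_str.upper(), None)
if not isinstance(loglevel, int):
    raise ValueError("Invalid log level: {}".format(loglevel_str))
logging.basicConfig(level=loglevel)  # debug=10, info=20, warning=30, ...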
10 changes: 9 additions & 1 deletion Scripts/autoreporting_utils.py
@@ -2,6 +2,7 @@
from subprocess import Popen, PIPE
import pandas as pd, numpy as np
import tabix
import logging

"""
Utility functions that are used in the scripts, put here for keeping the code clearer
@@ -105,4 +106,11 @@ def columns_from_arguments(column_labels):
In: column labels, as a list
Out: Dictionary with the members 'chrom','pos','ref','alt','pval', 'beta', 'af'
"""
return {"chrom":column_labels[0],"pos":column_labels[1],"ref":column_labels[2],"alt":column_labels[3],"pval":column_labels[4],"beta":column_labels[5],"af":column_labels[6]}

def log_arguments(args):
logger= logging.getLogger(__name__)
arguments=vars(args)
logger.info("Used arguments:")
for key, value in arguments.items():
logger.info(" {}: {}".format(key,value))
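Note: a short usage sketch of the new log_arguments helper (assuming autoreporting_utils is importable and the root logger is configured at INFO):

import argparse, logging
from autoreporting_utils import log_arguments

logging.basicConfig(level=logging.INFO)
args = argparse.Namespace(prefix="run1", loglevel="info")
log_arguments(args)
# INFO:autoreporting_utils:Used arguments:
# INFO:autoreporting_utils: prefix: run1
# INFO:autoreporting_utils: loglevel: info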
25 changes: 18 additions & 7 deletions Scripts/compare.py
@@ -8,6 +8,7 @@
import gwcatalog_api
import os
from multiprocessing.dummy import Pool as ThreadPool
import logging

def map_alleles(a1,a2):
"""
@@ -130,12 +131,14 @@ def create_top_level_report(report_df,efo_traits,columns,grouping_method,signifi
row["chr"]=loc_variants[columns["chrom"]].iat[0]
row["start"]=np.amin(loc_variants[columns["pos"]])
row["end"]=np.amax(loc_variants[columns["pos"]])
logger=logging.getLogger(__name__)
# Add annotation info. Wrapped in try/except because the annotation step may have been skipped.
try:
enrichment=loc_variants.loc[loc_variants["#variant"]==locus_id,"GENOME_FI_enrichment_nfe_est"].iat[0]
most_severe_gene=loc_variants.loc[loc_variants["#variant"]==locus_id,"most_severe_gene"].iat[0]
most_severe_consequence=loc_variants.loc[loc_variants["#variant"]==locus_id,"most_severe_consequence"].iat[0]
except:
logger.warning("Annotation information not available for top report. Those values will not be present in the final top report.")
enrichment=""
most_severe_gene=""
most_severe_consequence=""
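Note: the bare except above also hides unrelated failures. A self-contained sketch of the same fallback that catches only the expected cases (an editorial variant, not what this diff does):

import logging
import pandas as pd

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)

# One locus' rows, with the annotation column absent, as it would be
# when the annotation step was skipped.
loc_variants = pd.DataFrame({"#variant": ["chr1_100_A_T"]})
try:
    enrichment = loc_variants.loc[
        loc_variants["#variant"] == "chr1_100_A_T",
        "GENOME_FI_enrichment_nfe_est"].iat[0]
except (KeyError, IndexError):
    # KeyError: annotation column missing; IndexError: no matching row.
    logger.warning("Annotation information not available for top report.")
    enrichment = ""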
@@ -341,6 +344,7 @@ def compare(df, compare_style, summary_fpath, endpoints, ld_check, plink_mem, ld
ld_df (optional): a dataframe containing the LD paired associations of variants

"""
logger=logging.getLogger(__name__)
if df.empty:
#print("No variants, {} and {} will not be produced".format(report_out, ld_report_out))
return (None, None)
@@ -357,7 +361,7 @@ def compare(df, compare_style, summary_fpath, endpoints, ld_check, plink_mem, ld
gwas_df=None
gwapi=None
if os.path.exists("{}gwas_out_mapping.tsv".format(prefix)) and cache_gwas:
print("reading gwas results from gwas_out_mapping.tsv...")
logger.info("reading gwas results from gwas_out_mapping.tsv...")
gwas_df=pd.read_csv("{}gwas_out_mapping.tsv".format(prefix),sep="\t")
gwapi=gwcatalog_api.GwasApi()
else:
@@ -399,7 +403,7 @@ def compare(df, compare_style, summary_fpath, endpoints, ld_check, plink_mem, ld
summary_df=pd.concat([summary_df_1,summary_df_2],sort=True)
if summary_df.empty:
#just abort, output the top report but no merging summary df cause it doesn't exist
print("No summary variants, report will be incomplete")
logger.warning("No summary variants. Report will not have that information.")
report_out_df=df.copy()
report_out_df["#variant_hit"]="NA"
report_out_df["pval_trait"]="NA"
@@ -426,13 +430,13 @@ def compare(df, compare_style, summary_fpath, endpoints, ld_check, plink_mem, ld
#create chromosome list and group loci based on those
chrom_lst= sorted([*{*[s.split("_")[0].strip("chr") for s in unique_locus_list]}])
for chrom in chrom_lst:
print("------------LD for groups in chromosome {}------------".format(chrom))
logger.debug("------------LD for groups in chromosome {}------------".format(chrom))
groups=df[df[columns["chrom"]].astype(str) == chrom ].loc[:,"locus_id"].unique()
plink_cmd="plink --bfile {} --output-chr M --chr {} --make-bed --out {}temp_chrom --memory {}".format( ld_panel_path, chrom , prefix, plink_mem)
pr=subprocess.run(shlex.split(plink_cmd),stdout=PIPE,stderr=subprocess.STDOUT)
if pr.returncode!=0:
print("PLINK FAILURE for chromosome {}. Error code {}".format(chrom,pr.returncode) )
print(pr.stdout)
logger.warning("PLINK FAILURE for chromosome {}. Error code {}".format(chrom,pr.returncode) )
logger.warning(pr.stdout)
continue
for gr in groups:
ld=extract_ld_variants(df,summary_df,gr,ldstore_threads,ld_treshold,prefix,columns)
@@ -448,7 +452,7 @@ def compare(df, compare_style, summary_fpath, endpoints, ld_check, plink_mem, ld
ld_out=df.merge(ld_df,how="inner",left_on="#variant",right_on="RSID1")
ld_out=ld_out.drop(columns=["RSID1","map_variant","RSID2_map"]).rename(columns={"{}_x".format(columns["pval"]):columns["pval"],"{}_y".format(columns["pval"]):"pval_trait","RSID2":"#variant_hit"})
else:
print("No variants in ld found, no LD output file produced.")
logger.warning("No variants in ld found, no LD output file produced.")
return (report_out_df, ld_out)

if __name__ == "__main__":
@@ -478,8 +482,15 @@ def compare(df, compare_style, summary_fpath, endpoints, ld_check, plink_mem, ld
parser.add_argument("--efo-codes",dest="efo_traits",type=str,nargs="+",default=[],help="Specific EFO codes to look for in the top level report")
parser.add_argument("--local-gwascatalog",dest='localdb_path',type=str,help="Path to local GWAS Catalog DB.")
parser.add_argument("--db",dest="database_choice",type=str,default="gwas",help="Database to use for comparison. use 'local','gwas' or 'summary_stats'.")
parser.add_argument("--loglevel",dest="loglevel",type=str,default="warning",help="Level at which events are logged. Use values info, debug, warning, error, critical" )

args=parser.parse_args()
columns=autoreporting_utils.columns_from_arguments(args.column_labels)
loglevel=getattr(logging, args.loglevel.upper() )
logging.basicConfig(level=loglevel)
log_arguments(args)

columns=columns_from_arguments(args.column_labels)

if args.prefix!="":
args.prefix=args.prefix+"."
args.report_out = "{}{}".format(args.prefix,args.report_out)
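Note: each module obtains its logger with logging.getLogger(__name__), so the single logging.basicConfig(level=...) call in the entry script configures the root logger, and the module loggers propagate their records to it. A minimal sketch of that behaviour:

import logging

logging.basicConfig(level=logging.INFO)      # done once, in the entry script

logger = logging.getLogger("compare")        # stands in for getLogger(__name__)
logger.info("reading gwas results...")       # propagates to the root handler
logger.debug("filtered out at INFO level")   # not shown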
28 changes: 23 additions & 5 deletions Scripts/gws_fetch.py
@@ -7,6 +7,7 @@
import tabix
from autoreporting_utils import *
from linkage import PlinkLD, OnlineLD
import logging

def parse_region(region):
chrom=region.split(":")[0]
@@ -165,9 +166,19 @@ def get_gws_variants(fname, sign_treshold=5e-8,dtype=None,columns={"chrom":"#chr
columns["beta"]:np.float64,
columns["af"]:np.float64}
retval=pd.DataFrame()
for df in pd.read_csv(fname,compression=compression,sep="\t",dtype=dtype,engine="c",chunksize=chunksize):
retval=pd.concat( [retval,df.loc[df[columns["pval"] ] <=sign_treshold,: ] ], axis="index", ignore_index=True )
retval=retval[ [ columns["chrom"],columns["pos"],columns["ref"],columns["alt"],columns["pval"], columns["beta"],columns["af"] ] ]
try:
for df in pd.read_csv(fname,compression=compression,sep="\t",dtype=dtype,engine="c",chunksize=chunksize):
retval=pd.concat( [retval,df.loc[df[columns["pval"] ] <=sign_treshold,: ] ], axis="index", ignore_index=True,sort=False )
retval=retval[ [ columns["chrom"],columns["pos"],columns["ref"],columns["alt"],columns["pval"], columns["beta"],columns["af"] ] ]
except KeyError:
logger=logging.getLogger(__name__)
cols=list(df.columns)
columns_not_in_cols=[a for a in columns.values() if a not in cols]
logger.error("A KeyError happened while parsing the input file {}. It is likely that the columns given to the script do not match the actual input file columns.".format(fname))
logger.error("Input file columns:{}".format(cols))
logger.error("Columns given to script:{}".format(list(columns.values())))
logger.error("Columns not in input file columns:{}".format(columns_not_in_cols))
raise
return retval

def merge_credset(gws_df,cs_df,fname,columns):
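Note: the except KeyError branch above reports exactly which requested column names are missing from the input header. A small self-contained reproduction of that diagnostic (the column names are made up):

import pandas as pd

df = pd.DataFrame({"#chrom": [1], "pos": [100]})             # input header
columns = {"chrom": "#chrom", "pos": "pos", "pval": "pval"}  # script arguments
missing = [a for a in columns.values() if a not in df.columns]
print(missing)  # ['pval']
try:
    df[list(columns.values())]
except KeyError:
    print("KeyError: columns {} not in the input header".format(missing))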
@@ -215,7 +226,8 @@ def fetch_gws(gws_fpath, sig_tresh_1,prefix,group,grouping_method,locus_width,si
temp_df=temp_df.loc[~ign_idx,:]

if temp_df.empty:
print("The input file {} contains no gws-significant hits with significance threshold of {}. Aborting.".format(gws_fpath,sig_tresh_1))
logger=logging.getLogger(__name__)
logger.warning("The input file {} contains no gws-significant hits with significance threshold of {}. Aborting.".format(gws_fpath,sig_tresh_1))
return None

#data input: get credible set variants
@@ -279,8 +291,14 @@ def fetch_gws(gws_fpath, sig_tresh_1,prefix,group,grouping_method,locus_width,si
parser.add_argument("--ignore-region",dest="ignore_region",type=str,default="",help="Ignore the given region, e.g. HLA region, from analysis. Give in CHROM:BPSTART-BPEND format.")
parser.add_argument("--credible-set-file",dest="cred_set_file",type=str,default="",help="bgzipped SuSiE credible set file.")
parser.add_argument("--ld-api",dest="ld_api_choice",type=str,default="plink",help="LD interface to use. Valid options are 'plink' and 'online'.")
parser.add_argument("--loglevel",dest="loglevel",type=str,default="warning",help="Level at which events are logged. Use values info, debug, warning, error, critical" )

args=parser.parse_args()
columns=autoreporting_utils.columns_from_arguments(args.column_labels)
loglevel=getattr(logging, args.loglevel.upper() )
logging.basicConfig(level=loglevel)
log_arguments(args)

columns=columns_from_arguments(args.column_labels)
if args.prefix!="":
args.prefix=args.prefix+"."
args.fetch_out = "{}{}".format(args.prefix,args.fetch_out)
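Note: for context, the chunked read in get_gws_variants keeps memory bounded on large summary files by filtering each chunk down to genome-wide-significant rows before concatenating. A standalone sketch of the pattern (file name, column name and threshold are placeholders):

import pandas as pd

significant = pd.DataFrame()
for chunk in pd.read_csv("summary_stats.tsv.gz", sep="\t", chunksize=100000):
    hits = chunk.loc[chunk["pval"] <= 5e-8, :]
    significant = pd.concat([significant, hits], ignore_index=True, sort=False)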
38 changes: 29 additions & 9 deletions Scripts/main.py
@@ -1,13 +1,17 @@
#!/usr/bin/env python3

import argparse,shlex,subprocess
import argparse,shlex,subprocess,traceback,sys
import pandas as pd
import numpy as np
import gws_fetch, compare, annotate,autoreporting_utils
from linkage import PlinkLD, OnlineLD
import logging


def main(args):
print("input file: {}".format(args.gws_fpath))
logger= logging.getLogger(__name__)
autoreporting_utils.log_arguments(args)
#print()
args.fetch_out = "{}{}".format(args.prefix,args.fetch_out)
args.annotate_out = "{}{}".format(args.prefix,args.annotate_out)
args.report_out = "{}{}".format(args.prefix,args.report_out)
@@ -22,17 +26,17 @@ def main(args):
elif args.ld_api_choice == "online":
ld_api = OnlineLD(url="http://api.finngen.fi/api/ld")
else:
logger.critical("Wrong argument for --ld-api:{}".format(args.ld_api_choice))
raise ValueError("Wrong argument for --ld-api:{}".format(args.ld_api_choice))
###########################
###Filter and Group SNPs###
###########################
print("filter & group SNPs")
logger.info("filter & group SNPs")
args.annotate_fpath=args.fetch_out
args.compare_fname=args.annotate_out
fetch_df = gws_fetch.fetch_gws(gws_fpath=args.gws_fpath, sig_tresh_1=args.sig_treshold, prefix=args.prefix, group=args.grouping, grouping_method=args.grouping_method, locus_width=args.loc_width,
sig_tresh_2=args.sig_treshold_2, ld_r2=args.ld_r2, overlap=args.overlap, columns=columns,
ignore_region=args.ignore_region, cred_set_file=args.cred_set_file,ld_api=ld_api)

#write fetch_df as a file, so that other parts of the script work
fetch_df.to_csv(path_or_buf=args.fetch_out,sep="\t",index=False,float_format="%.3g")
###########################
@@ -42,20 +46,20 @@ def main(args):
###########################
#######Annotate SNPs#######
###########################
if (args.gnomad_exome_path == None) or (args.gnomad_genome_path == None) or (args.finngen_path==None):
print("Annotation files missing, skipping gnomad & finngen annotation...")
if (args.gnomad_exome_path == None) or (args.gnomad_genome_path == None) or (args.finngen_path==None) or (args.functional_path==None):
logger.info("Annotation files missing, skipping gnomad & finngen & functional annotation...")
#args.compare_fname=args.annotate_fpath
annotate_df = fetch_df
else:
print("Annotate SNPs")
logger.info("Annotate SNPs")
#annotate_df = annotate.annotate(fetch_df,args)
annotate_df = annotate.annotate(df=fetch_df,gnomad_genome_path=args.gnomad_genome_path, gnomad_exome_path=args.gnomad_exome_path, batch_freq=args.batch_freq, finngen_path=args.finngen_path, fg_ann_version = args.fg_ann_version,
functional_path=args.functional_path, prefix=args.prefix, columns=columns)
annotate_df.to_csv(path_or_buf=args.annotate_out,sep="\t",index=False,float_format="%.3g")
###########################
######Compare results######
###########################
print("Compare results to previous findings")
logger.info("Compare results to previous findings")
[report_df,ld_out_df] = compare.compare(annotate_df,compare_style=args.compare_style, summary_fpath=args.summary_fpath, endpoints=args.endpoints,ld_check=args.ld_check,
plink_mem=args.plink_mem, ld_panel_path=args.ld_panel_path, prefix=args.prefix,
gwascatalog_pval=args.gwascatalog_pval, gwascatalog_pad=args.gwascatalog_pad, gwascatalog_threads=args.gwascatalog_threads,
@@ -119,7 +123,23 @@ def main(args):
parser.add_argument("--efo-codes",dest="efo_traits",type=str,nargs="+",default=[],help="Specific EFO codes to look for in the top level report")
parser.add_argument("--local-gwascatalog",dest='localdb_path',type=str,help="Path to local GWAS Catalog DB.")
parser.add_argument("--db",dest="database_choice",type=str,default="gwas",help="Database to use for comparison. use 'local','gwas' or 'summary_stats'.")

#other
parser.add_argument("--loglevel",dest="loglevel",type=str,default="warning",help="Level at which events are logged. Use values info, debug, warning, error, critical" )
args=parser.parse_args()
loglevel=getattr(logging, args.loglevel.upper() )
logging.basicConfig(level=loglevel)
if args.prefix!="":
args.prefix=args.prefix+"."
main(args)
try:
main(args)
except Exception as e:
logger= logging.getLogger(__name__)
logger.exception("Exception occurred. The stack trace has been captured. The tool will now abort.")
#fname=die()
#print("Traceback printed in file {}".format(fname))
sys.exit(1)
else:
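Note: the entry point now wraps main in try/except/else so that any unhandled exception is logged with its traceback via logger.exception before the process exits non-zero, while the else branch exits 0 on success. A self-contained sketch of the same pattern (the RuntimeError is a stand-in failure):

import logging, sys

def main():
    raise RuntimeError("something went wrong")  # stand-in failure

if __name__ == "__main__":
    logging.basicConfig(level=logging.WARNING)
    try:
        main()
    except Exception:
        logging.getLogger(__name__).exception("Exception occurred. The tool will now abort.")
        sys.exit(1)
    else:
        sys.exit(0)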
sys.exit(0)