Skip to content

Commit

Permalink
Adjusted filter_mutations.py to make blacklist and whitelist optional
Browse files Browse the repository at this point in the history
  • Loading branch information
RaqManzano committed Sep 6, 2023
1 parent df59be1 commit dd3d0ba
Showing 1 changed file with 35 additions and 31 deletions.
66 changes: 35 additions & 31 deletions bin/filter_mutations.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
#!/usr/bin/env python3
"""
Date: 13 Feb 2023
Author: @RaqManzano
Script: Filter variants from a MAF file producing another MAF file with the new filters added.
"""
Expand Down Expand Up @@ -89,7 +88,8 @@ def remove_ig_and_pseudo(maf):
"""
# fill na values
maf[['BIOTYPE', 'SYMBOL']] = maf[['BIOTYPE', 'SYMBOL']].fillna(value="")
maf["ig_pseudo"] = maf["BIOTYPE"].str.contains('IG_C_gene|IG_D_gene|IG_J_gene|IG_V_gene|TR_C_gene|TR_J_gene|TR_V_gene|pseudogene')
maf["ig_pseudo"] = maf["BIOTYPE"].str.contains(
'IG_C_gene|IG_D_gene|IG_J_gene|IG_V_gene|TR_C_gene|TR_J_gene|TR_V_gene|pseudogene')
return maf


Expand Down Expand Up @@ -132,7 +132,6 @@ def add_context(chrom, pos, ref, genome, flank=10):
return context



def remove_homopolymers(maf, ref):
"""
Check for variants in homopolymer regions (a sequence of 6 consecutive identical bases)
Expand Down Expand Up @@ -177,16 +176,17 @@ def filtering(maf, gnomad_thr, whitelist, blacklist, filters):
"""
if "PASS" not in filters:
filters += ["PASS"] # a PASS is always allowed
maf["whitelist"] = maf['DNAchange'].isin(whitelist) # whitelist
maf = remove_muts_in_range(df=maf, blacklist=blacklist) # blacklist
if whitelist:
maf["whitelist"] = maf['DNAchange'].isin(whitelist) # whitelist
if blacklist:
maf = remove_muts_in_range(df=maf, blacklist=blacklist) # blacklist
maf["ingnomAD"] = maf["MAX_AF"] >= gnomad_thr # gnomad

return maf



def add_ravex_filters(maf, filters, noncoding=False, homopolymer=False, ig_pseudo=False, min_alt_reads=2,
consensus=True):
blacklist=False,whitelist=False):
maf["RaVeX_FILTER"] = "PASS"
maf["Existing_variation"] = maf["Existing_variation"].fillna("")
maf["SOMATIC"] = maf["SOMATIC"].fillna("")
Expand All @@ -196,8 +196,9 @@ def add_ravex_filters(maf, filters, noncoding=False, homopolymer=False, ig_pseud
ravex_filter += ["min_alt_reads"]
if row["ingnomAD"]:
ravex_filter += ["gnomad"]
if row["blacklist"]:
ravex_filter += ["blacklist"]
if blacklist:
if row["blacklist"]:
ravex_filter += ["blacklist"]
if not noncoding:
if row["noncoding"]:
ravex_filter += ["noncoding"]
Expand All @@ -214,46 +215,43 @@ def add_ravex_filters(maf, filters, noncoding=False, homopolymer=False, ig_pseud
else:
if not row["FILTER_consensus"] in filters:
ravex_filter += ["vc_filter"]
if not ravex_filter or row["whitelist"]:
ravex_filter = ["PASS"]
if whitelist:
if not ravex_filter or row["whitelist"]:
ravex_filter = ["PASS"]
else:
if not ravex_filter:
ravex_filter = ["PASS"]
ravex_filter = ";".join(ravex_filter)
maf.at[idx, "RaVeX_FILTER"] = ravex_filter
return maf


def dedup_maf(dnachange, maf):
"""If more than one caller, then we need to dedup the entries. We select according to the caller we trust more:
mutect2, strelka, sage, others for SNVs, strelka, mutect2, sage for indels
"""If more than one caller, then we need to dedup the entries. We select according to this caller list:
mutect2, sage, strelka
TODO: this is too slow, need to think on a better implementation
"""
maf_variant = maf[maf["DNAchange"] == dnachange]

if maf_variant.iloc[0]["Variant_Type"] == "SNP":
maf_variant_dedup = maf_variant[maf_variant["Caller"].str.contains("mutect")]
maf_variant_dedup = maf_variant[maf_variant["Caller"].str.contains("mutect")]
if maf_variant_dedup.empty:
maf_variant_dedup = maf_variant[maf_variant["Caller"].str.contains("sage")]
if maf_variant_dedup.empty:
maf_variant_dedup = maf_variant[maf_variant["Caller"].str.contains("strelka")]
if maf_variant_dedup.empty:
maf_variant_dedup = maf_variant[maf_variant["Caller"].str.contains("sage")]
if maf_variant_dedup.empty:
maf_variant_dedup = maf_variant.drop_duplicates(subset='DNAchange', keep="first")
else:
maf_variant_dedup = maf_variant[maf_variant["Caller"].str.contains("strelka")]
if maf_variant_dedup.empty:
maf_variant_dedup = maf_variant[maf_variant["Caller"].str.contains("mutect")]
if maf_variant_dedup.empty:
maf_variant_dedup = maf_variant[maf_variant["Caller"].str.contains("sage")]
if maf_variant_dedup.empty:
maf_variant_dedup = maf_variant.drop_duplicates(subset='DNAchange', keep="first")
maf_variant_dedup = maf_variant.drop_duplicates(subset='DNAchange', keep="first")
return maf_variant_dedup


def write_maf(maf_df, mafin_file, mafout_file, vc_priority=["mutect2", "strelka", 'sage', 'consensus']):
def write_maf(maf_df, mafin_file, mafout_file, vc_priority=["mutect2", 'sage', "strelka"]):
"""Write output"""
header_lines = subprocess.getoutput(f"zgrep -Eh '#|Hugo_Symbol' {mafin_file} 2>/dev/null")
print("Removing duplicated variants from maf (only one entry from a caller will be kept)")
maf_dedup = []
all_callers = maf_df["Caller"].unique()
vc_priority = vc_priority + [x for x in all_callers if x not in vc_priority]
for caller in vc_priority:
maf_dedup += [maf_df[maf_df["Caller"]==caller]]
maf_dedup += [maf_df[maf_df["Caller"] == caller]]
maf_dedup = pd.concat(maf_dedup).drop_duplicates(subset='DNAchange', keep="first")
with open(mafout_file, "w") as mafout:
mafout.write(header_lines)
Expand All @@ -268,9 +266,15 @@ def main():
'IGR', 'INTRON', 'RNA']
args = argparser()
maf = read_maf(args.input)
whitelist = False
blacklist = False
if args.whitelist:
whitelist = read_whitelist_bed(args.whitelist)
if args.blacklist:
blacklist = read_blacklist_bed(args.blacklist)
maf = filtering(maf=maf, gnomad_thr=args.gnomad_thr,
whitelist=read_whitelist_bed(args.whitelist),
blacklist=read_blacklist_bed(args.blacklist),
whitelist=whitelist,
blacklist=blacklist,
filters=args.filters)
# tag noncoding
maf = noncoding(maf=maf, noncoding=noncoding_list)
Expand All @@ -279,7 +283,7 @@ def main():
# tag homopolymers
maf = remove_homopolymers(maf=maf, ref=args.ref)
# tag consensus
maf = add_ravex_filters(maf=maf, filters=args.filters)
maf = add_ravex_filters(maf=maf, filters=args.filters, blacklist=blacklist, whitelist=whitelist)
if not args.output:
args.output = args.input.replace(".maf", "filtered.maf")
write_maf(maf_df=maf, mafin_file=args.input, mafout_file=args.output)
Expand Down

0 comments on commit dd3d0ba

Please sign in to comment.