diff --git a/cre.gnomad_genome.conf b/cre.gnomad_genome.conf deleted file mode 100644 index 405b72a..0000000 --- a/cre.gnomad_genome.conf +++ /dev/null @@ -1,70 +0,0 @@ -[[annotation]] -file="variation/gnomad_genome.vcf.gz" -fields=["AF", "AC", "Hom"] -names=["Gnomad_af_gs", "Gnomad_ac_gs", "Gnomad_hom_gs"] -ops=["self", "self", "self"] - -#Possible Fields -#AC_AFR: "Allele count in African/African American genotypes, for each ALT allele, in the same order as listed" -#AC_AMR: "Allele count in Admixed American genotypes, for each ALT allele, in the same order as listed" -#AC_ASJ: "Allele count in Ashkenazi Jewish genotypes, for each ALT allele, in the same order as listed" -#AC_EAS: "Allele count in East Asian genotypes, for each ALT allele, in the same order as listed" -#AC_FIN: "Allele count in Finnish genotypes, for each ALT allele, in the same order as listed" -#AC_NFE: "Allele count in Non-Finnish European genotypes, for each ALT allele, in the same order as listed" -#AC_OTH: "Allele count in Other (population not assigned) genotypes, for each ALT allele, in the same order as listed" -#AC_Male: "Allele count in Male genotypes, for each ALT allele, in the same order as listed" -#AC_Female: "Allele count in Female genotypes, for each ALT allele, in the same order as listed" -#AN_AFR: "Total number of alleles in African/African American called genotypes" -#AN_AMR: "Total number of alleles in Admixed American called genotypes" -#AN_ASJ: "Total number of alleles in Ashkenazi Jewish called genotypes" -#AN_EAS: "Total number of alleles in East Asian called genotypes" -#AN_FIN: "Total number of alleles in Finnish called genotypes" -#AN_NFE: "Total number of alleles in Non-Finnish European called genotypes" -#AN_OTH: "Total number of alleles in Other (population not assigned) called genotypes" -#AN_Male: "Total number of alleles in Male called genotypes" -#AN_Female: "Total number of alleles in Female called genotypes" -#AF_AFR: "Allele Frequency among African/African 
American genotypes, for each ALT allele, in the same order as listed" -#AF_AMR: "Allele Frequency among Admixed American genotypes, for each ALT allele, in the same order as listed" -#AF_ASJ: "Allele Frequency among Ashkenazi Jewish genotypes, for each ALT allele, in the same order as listed" -#AF_EAS: "Allele Frequency among East Asian genotypes, for each ALT allele, in the same order as listed" -#AF_FIN: "Allele Frequency among Finnish genotypes, for each ALT allele, in the same order as listed" -#AF_NFE: "Allele Frequency among Non-Finnish European genotypes, for each ALT allele, in the same order as listed" -#AF_OTH: "Allele Frequency among Other (population not assigned) genotypes, for each ALT allele, in the same order as listed" -#AF_Male: "Allele Frequency among Male genotypes, for each ALT allele, in the same order as listed" -#AF_Female: "Allele Frequency among Female genotypes, for each ALT allele, in the same order as listed" -#GC_AFR: "Count of African/African American individuals for each genotype" -#GC_AMR: "Count of Admixed American individuals for each genotype" -#GC_ASJ: "Count of Ashkenazi Jewish individuals for each genotype" -#GC_EAS: "Count of East Asian individuals for each genotype" -#GC_FIN: "Count of Finnish individuals for each genotype" -#GC_NFE: "Count of Non-Finnish European individuals for each genotype" -#GC_OTH: "Count of Other (population not assigned) individuals for each genotype" -#GC_Male: "Count of Male individuals for each genotype" -#GC_Female: "Count of Female individuals for each genotype" -#AC_raw: "Allele counts before filtering low-confidence genotypes, for each ALT allele, in the same order as listed" -#AN_raw: "Total number of alleles before filtering low-confidence genotypes" -#AF_raw: "Allele frequency before filtering low-confidence genotypes, for each ALT allele, in the same order as listed" -#GC_raw: "Raw count of individuals for each genotype before filtering low-confidence genotypes" -#GC: "Count of individuals 
for each genotype" -#Hom_AFR: "Count of homozygous African/African American individuals" -#Hom_AMR: "Count of homozygous Admixed American individuals" -#Hom_ASJ: "Count of homozygous Ashkenazi Jewish individuals" -#Hom_EAS: "Count of homozygous East Asian individuals" -#Hom_FIN: "Count of homozygous Finnish individuals" -#Hom_NFE: "Count of homozygous Non-Finnish European individuals" -#Hom_OTH: "Count of homozygous Other (population not assigned) individuals" -#Hom_Male: "Count of homozygous Male individuals" -#Hom_Female: "Count of homozygous Female individuals" -#Hom_raw: "Count of homozygous individuals in raw genotypes before filtering low-confidence genotypes" -#Hom: "Count of homozygous individuals" -#STAR_AC: "AC of deletions spanning this position" -#STAR_AC_raw: "Allele counts of deletions spanning this position before filtering low-confidence genotypes" -#STAR_Hom: "Count of individuals homozygous for a deletion spanning this position" -#POPMAX: "Population with max AF" -#AC_POPMAX: "AC in the population with the max AF" -#AN_POPMAX: "AN in the population with the max AF" -#AF_POPMAX: "Maximum Allele Frequency across populations (excluding OTH)" -#DP_MEDIAN: "Median DP in carriers of each allele" -#DREF_MEDIAN,NumberA,TypeFloat,"Median dosage of homozygous reference in carriers of each allele" -#GQ_MEDIAN: "Median GQ in carriers of each allele" -#AB_MEDIAN: "Median allele balance in heterozygote carriers of each allele" \ No newline at end of file diff --git a/cre.vcfanno.conf b/cre.vcfanno.conf new file mode 100644 index 0000000..23d04df --- /dev/null +++ b/cre.vcfanno.conf @@ -0,0 +1,208 @@ +[[annotation]] +file="gnomad.exomes.r2.0.1.sites.no-VEP.nohist.tidy.vcf.gz" +fields=["AF", "AC", "Hom"] +names=["Gnomad_af_es", "Gnomad_ac_es", "Gnomad_hom_es"] +ops=["self", "self", "self"] + +[[annotation]] +file="variation/gnomad_genome.vcf.gz" +fields=["AF", "AC", "Hom"] +names=["Gnomad_af_gs", "Gnomad_ac_gs", "Gnomad_hom_gs"] +ops=["self", "self", "self"] + 
+[[annotation]] +file="ExAC.r0.3.sites.vep.tidy.vcf.gz" +fields = ["AF","AC","AC_Het", "AC_Hom"] + names = ["Exac_af","Exac_ac","Exac_ac_het", "Exac_hom"] +ops=["self","self","self","self"] + +[[annotation]] +file="ESP6500SI.all.snps_indels.tidy.v2.vcf.gz" +fields=["EA_AC", "AA_AC", "TAC"] +names=["af_esp_ea_float", "af_esp_aa_float", "af_esp_all_float"] +ops=["lua:ratio(vals)", "lua:ratio(vals)", "lua:ratio(vals)"] + + +[[annotation]] +file="dbsnp.b147.20160601.tidy.vcf.gz" +fields=["ID"] +names=["rs_ids"] +ops=["concat"] + +[[annotation]] +file="ALL.wgs.phase3_shapeit2_mvncall_integrated_v5a.20130502.sites.tidy.vcf.gz" +fields=["AMR_AF", "EAS_AF", "SAS_AF", "AFR_AF", "EUR_AF", "AF"] +names=["af_1kg_amr", "af_1kg_eas", "af_1kg_sas", "af_1kg_afr", "af_1kg_eur", "af_1kg_all"] +ops=["max", "max", "max", "max", "max", "max"] + + +[[annotation]] +file="clinvar_20170130.tidy.vcf.gz" +fields=["CLNSIG", "CLNDBN"] +names=["clinvar_pathogenic", "clinvar_disease_name"] +ops=["self", "self"] + +# convert 5 to 'pathogenic', 255 to 'other', etc. +[[postannotation]] +fields=["clinvar_pathogenic"] +op="lua:clinvar_sig(clinvar_pathogenic)" +name="clinvar_sig" +type="String" + +# calculate allele frequencies for all populations. 
+[[postannotation]] +fields=["ac_exac_all", "an_exac_all"] +name="af_exac_all" +op="div2" +type="Float" + + [[postannotation]] + fields=["ac_adj_exac_afr", "an_adj_exac_afr"] + name="af_adj_exac_afr" + op="div2" + type="Float" + + [[postannotation]] + fields=["ac_adj_exac_amr", "an_adj_exac_amr"] + name="af_adj_exac_amr" + op="div2" + type="Float" + + [[postannotation]] + fields=["ac_adj_exac_eas", "an_adj_exac_eas"] + name="af_adj_exac_eas" + op="div2" + type="Float" + + [[postannotation]] + fields=["ac_adj_exac_fin", "an_adj_exac_fin"] + name="af_adj_exac_fin" + op="div2" + type="Float" + + [[postannotation]] + fields=["ac_adj_exac_nfe", "an_adj_exac_nfe"] + name="af_adj_exac_nfe" + op="div2" + type="Float" + + [[postannotation]] + fields=["ac_adj_exac_oth", "an_adj_exac_oth"] + name="af_adj_exac_oth" + op="div2" + type="Float" + + [[postannotation]] + fields=["ac_adj_exac_sas", "an_adj_exac_sas"] + name="af_adj_exac_sas" + op="div2" + type="Float" + + + [[postannotation]] + fields=['af_adj_exac_afr', 'af_adj_exac_amr', 'af_adj_exac_eas', 'af_adj_exac_fin', 'af_adj_exac_nfe', 'af_adj_exac_oth', 'af_adj_exac_sas', "af_esp_ea", "af_esp_aa", "af_esp_all", "af_1kg_amr", "af_1kg_eas", "af_1kg_sas", "af_1kg_afr", "af_1kg_eur", "af_1kg_all"] + op="max" + name="max_aaf_all" + type="Float" + + [[postannotation]] + fields=["clinvar_sig", "max_aaf_all"] + op="lua:check_clinvar_aaf(clinvar_sig, max_aaf_all, 0.005)" + name="common_pathogenic" + type="Flag" + + [[annotation]] + file="cosmic-v68-GRCh37.tidy.vcf.gz" + fields=["ID"] + names=["cosmic_ids"] + ops=["uniq"] + + + [[annotation]] + file="hg19_fitcons_fc-i6-0_V1-01.bed.gz" + columns=[4] + names=["fitcons_float"] + ops=["mean"] + + + + #[[annotation]] + #file="LCR-hs37d5.bed.gz" + #names=["LCR"] + #columns=[2] + #ops=["flag"] + # + #[[annotation]] + #http://humanparalogy.gs.washington.edu/build37/build37.htm + # wget -O - http://humanparalogy.gs.washington.edu/build37/data/GRCh37GenomicSuperDup.tab \ + # | tail -n+2 \ + 
# | grep -Pv "_gl00|_random|chrUn" | sort -k1,1V -k2,2n | bgzip -c > GRCh37GenomicSuperDup.tab.gz
+ #file="GRCh37GenomicSuperDup.tab.gz"
+ #names=["superdup"]
+ #columns=[2]
+ #ops=["flag"]
+
+
+
+ [[annotation]]
+ file="encode.6celltypes.consensus.bedg.gz"
+ #chrom start end gm12878 h1hesc helas3 hepg2 huvec k562
+ columns=[4,5,6,7,8,9]
+ ops=["concat","concat", "concat", "concat", "concat", "concat"]
+ names=[ "encode_consensus_gm12878", "encode_consensus_h1hesc", "encode_consensus_helas3", "encode_consensus_hepg2", "encode_consensus_huvec", "encode_consensus_k562"]
+
+ [[annotation]]
+ file="hg19.gwas.bed.gz"
+ columns=[4]
+ names=["gwas_pubmed_trait"]
+ ops=["uniq"]
+
+ [[annotation]]
+ file="hg19.rmsk.bed.gz"
+ columns=[4]
+ names=["rmsk"]
+ ops=["uniq"]
+
+ [[annotation]]
+ file="hg19.gerp.elements.bed.gz"
+ columns=[4]
+ names=["gerp_elements"]
+ ops=["mean"]
+
+ [[annotation]]
+ file="hg19.CpG.bed.gz"
+ columns=[2]
+ names=["cpg_island"]
+ ops=["flag"]
+
+ [[annotation]]
+ file="hg19.dgv.bed.gz"
+ columns=[4]
+ names=["dgv"]
+ ops=["uniq"]
+
+ [[annotation]]
+ file="wgEncodeRegTfbsClusteredV2.cell_count.20130213.bed.gz"
+ columns=[4]
+ names=["tfbs"]
+ ops=["uniq"]
+
+ [[annotation]]
+ file="genetic_map_HapMapII_GRCh37.gz"
+ columns=[4, 5]
+ ops=["mean", "mean"]
+ names=["hapmap1", "hapmap2"]
+
+
+ [[annotation]]
+ file="stam.125cells.dnaseI.hg19.bed.gz"
+ columns=[5, 6]
+ ops=["mean", "uniq"]
+ names=["stam_mean", "stam_names"]
+
+ [[annotation]]
+ file="cse-hiseq-8_4-2013-02-20.bed.gz"
+ columns=[2]
+ ops=["flag"]
+ names=["cse-hiseq"]
+
\ No newline at end of file
diff --git a/cre.vcfanno.lua b/cre.vcfanno.lua
new file mode 100644
index 0000000..eb2675a
--- /dev/null
+++ b/cre.vcfanno.lua
@@ -0,0 +1,179 @@
+-- from bcbio/genomes/Hsapiens/GRCh37/configs/vcfanno/gemini.lua
+-- Helpers for cre.vcfanno.conf, which calls ratio, clinvar_sig and
+-- check_clinvar_aaf by name through "lua:" ops, so they stay global.
+
+-- Arithmetic mean of the sequence vals.
+-- An empty sequence is not guarded against and evaluates 0 / 0.
+function mean(vals)
+    local sum = 0
+    for i = 1, #vals do
+        sum = sum + vals[i]
+    end
+    return sum / #vals
+end
+
+-- Format a region as "chrom:start-stop".
+function loc(chrom, start, stop)
+    return chrom .. ":" .. start .. "-" .. stop
+end
+
+-- Code-to-label table; no function in this file reads it.
+CLINVAR_LOOKUP = {}
+CLINVAR_LOOKUP['0'] = 'unknown'
+CLINVAR_LOOKUP['1'] = 'germline'
+CLINVAR_LOOKUP['2'] = 'somatic'
+CLINVAR_LOOKUP['4'] = 'inherited'
+CLINVAR_LOOKUP['8'] = 'paternal'
+CLINVAR_LOOKUP['16'] = 'maternal'
+CLINVAR_LOOKUP['32'] = 'de-novo'
+CLINVAR_LOOKUP['64'] = 'biparental'
+CLINVAR_LOOKUP['128'] = 'uniparental'
+CLINVAR_LOOKUP['256'] = 'not-tested'
+CLINVAR_LOOKUP['512'] = 'tested-inconclusive'
+CLINVAR_LOOKUP['1073741824'] = 'other'
+
+-- Code-to-label table read by clinvar_sig; the keys are strings.
+CLINVAR_SIG = {}
+CLINVAR_SIG['0'] = 'uncertain'
+CLINVAR_SIG['1'] = 'not-provided'
+CLINVAR_SIG['2'] = 'benign'
+CLINVAR_SIG['3'] = 'likely-benign'
+CLINVAR_SIG['4'] = 'likely-pathogenic'
+CLINVAR_SIG['5'] = 'pathogenic'
+CLINVAR_SIG['6'] = 'drug-response'
+CLINVAR_SIG['7'] = 'histocompatibility'
+CLINVAR_SIG['255'] = 'other'
+CLINVAR_SIG['.'] = '.'
+
+-- Copy elements 1..#ud of an indexable value into a new table.
+function intotbl(ud)
+    local tbl = {}
+    for i = 1, #ud do
+        tbl[i] = ud[i]
+    end
+    return tbl
+end
+
+-- from lua-users wiki
+-- Split str on the characters of sep (default ":"), dropping empty fields.
+-- sep goes into a character class unescaped, so pattern-magic characters
+-- in sep are not taken literally.
+function split(str, sep)
+    local sep, fields = sep or ":", {}
+    local pattern = string.format("([^%s]+)", sep)
+    str:gsub(pattern, function(c) fields[#fields+1] = c end)
+    return fields
+end
+
+-- True when the Lua pattern tok matches somewhere in str.
+function contains(str, tok)
+    return string.find(str, tok) ~= nil
+end
+
+
+-- Return a / b formatted with nine decimals, or "0.0" when a == 0.
+function div2(a, b)
+    if(a == 0) then return "0.0" end
+    return string.format("%.9f", (a + 0) / b)
+end
+
+-- vals[1] holds a pair of counts. Return pair[2] / (pair[1] + pair[2])
+-- formatted with nine decimals, or "0.0" when pair[2] == 0.
+function ratio(vals)
+    vals = vals[1] -- get 2 values per element. ref and alt counts.
+    if vals[2] == 0 then return "0.0" end
+    return string.format("%.9f", vals[2] / (vals[1] + vals[2]))
+end
+
+-- Normalise one code to the string form used for CLINVAR_SIG keys.
+local function sig_key(code)
+    if type(code) == "number" then
+        return string.format("%.0f", code)
+    end
+    return code
+end
+
+-- Translate ClinVar significance codes into their CLINVAR_SIG labels.
+-- vals may be a number, a string, a table or userdata. In a string, ","
+-- separates entries and "|" separates the codes inside one entry; both
+-- separators are kept in the result. Codes missing from CLINVAR_SIG are
+-- skipped, and a single unknown code returns nil.
+function clinvar_sig(vals)
+    local t = type(vals)
+    if t == "number" then
+        return CLINVAR_SIG[sig_key(vals)]
+    elseif t == "string" then
+        -- just a single-value
+        if not contains(vals, "|") and not contains(vals, ",") then
+            return CLINVAR_SIG[vals]
+        end
+        vals = split(vals, ",")
+    elseif t == "userdata" then
+        vals = intotbl(vals)
+    elseif t ~= "table" then
+        vals = {vals}
+    end
+    local ret = {}
+    for i = 1, #vals do
+        local entry = sig_key(vals[i])
+        if not contains(entry, "|") then
+            ret[#ret+1] = CLINVAR_SIG[entry]
+        else
+            local inret = {}
+            for _, code in ipairs(split(entry, "|")) do
+                inret[#inret+1] = CLINVAR_SIG[code]
+            end
+            ret[#ret+1] = join(inret, "|")
+        end
+    end
+    return join(ret, ",")
+end
+
+join = table.concat
+
+-- Report whether a variant labelled pathogenic is also common.
+-- Returns true when clinvar_sig contains "pathogenic" (this also matches
+-- "likely-pathogenic") and max_aaf_all, or any element of it when it is a
+-- table, is greater than aaf_cutoff. Returns false when clinvar_sig or
+-- max_aaf_all is nil.
+function check_clinvar_aaf(clinvar_sig, max_aaf_all, aaf_cutoff)
+    -- didn't find an aaf for this so can't be common
+    if max_aaf_all == nil or clinvar_sig == nil then
+        return false
+    end
+    if type(clinvar_sig) ~= "string" then
+        clinvar_sig = join(clinvar_sig, ",")
+    end
+    if false == contains(clinvar_sig, "pathogenic") then
+        return false
+    end
+    if type(max_aaf_all) ~= "table" then
+        return max_aaf_all > aaf_cutoff
+    end
+    for i, aaf in pairs(max_aaf_all) do
+        if aaf > aaf_cutoff then
+            return true
+        end
+    end
+    return false
+end
+
+
+-- Join the arguments into one ";"-separated ID string, in argument order.
+-- nil, "." and "" are skipped, repeated values are kept once, and any ","
+-- inside a value becomes ";".
+function setid(...)
+    local res = {}
+    local seen = {}
+    -- select("#", ...) counts nil arguments too, so holes cannot reorder or
+    -- cut short the walk the way iterating {...} with pairs could.
+    for i = 1, select("#", ...) do
+        local v = select(i, ...)
+        if v ~= nil and v ~= "." and v ~= "" and seen[v] == nil then
+            -- parentheses keep only gsub's first result (the new string)
+            res[#res+1] = (string.gsub(v, ",", ";"))
+            seen[v] = true
+        end
+    end
+    return table.concat(res, ";")
+end