Skip to content

Commit

Permalink
Add more optional --report fields PlantandFoodResearch#174
Browse files Browse the repository at this point in the history
- Allow specification of INFO and FORMAT field variants
- Add ACP field
- Add INFO/AOPSUM field
- Add SNVDP field
  • Loading branch information
timothymillar committed May 9, 2024
1 parent 6032359 commit f3fabf9
Show file tree
Hide file tree
Showing 15 changed files with 203 additions and 112 deletions.
22 changes: 15 additions & 7 deletions mchap/application/arguments.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@

from mchap.constant import PFEIFFER_ERROR
from mchap.io import extract_sample_ids
from mchap.io.vcf.infofields import OPTIONAL_INFO_FIELDS
from mchap.io.vcf.formatfields import OPTIONAL_FORMAT_FIELDS


@dataclass
Expand Down Expand Up @@ -288,20 +290,26 @@ def add_to(self, parser):
),
)


_optional_field_descriptions = [
"INFO/{} = {}".format(f.id, f.descr) for f in OPTIONAL_INFO_FIELDS
]
_optional_field_descriptions += [
"FORMAT/{}: {}".format(f.id, f.descr) for f in OPTIONAL_FORMAT_FIELDS
]

report = Parameter(
"--report",
dict(
type=str,
nargs="*",
default=[],
help=(
"Extra fields to report within the output VCF: "
"AFPRIOR = prior allele frequencies; "
"AFP = posterior mean allele frequencies; "
"AOP = posterior probability of allele occurring at any copy number; "
"GP = genotype posterior probabilities; "
"GL = genotype likelihoods."
),
"Extra fields to report within the output VCF. "
"The INFO/FORMAT prefix may be omitted to return both "
"variations of the named field. Options include: "
)
+ "; ".join(_optional_field_descriptions),
),
)

Expand Down
6 changes: 5 additions & 1 deletion mchap/application/assemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ def call_sample_genotypes(self, data):
"MCI",
"GL",
"GP",
"ACP",
"AFP",
"AOP",
]:
Expand Down Expand Up @@ -211,7 +212,7 @@ def call_sample_genotypes(self, data):
data.sampledata["alleles"][sample] = alleles

# posterior allele frequencies/occurrences if requested
if ("AFP" in data.formatfields) or ("AOP" in data.formatfields):
if self.require_AFP():
frequencies = np.zeros(len(haplotypes))
occurrences = np.zeros(len(haplotypes))
haps, freqs, occur = sample_posteriors[sample].allele_frequencies()
Expand All @@ -224,6 +225,9 @@ def call_sample_genotypes(self, data):
data.sampledata["AOP"][sample] = np.round(
occurrences, self.precision
)
data.sampledata["ACP"][sample] = np.round(
frequencies * data.sample_ploidy[sample], self.precision
)

# encode posterior probabilities if requested
if "GP" in data.formatfields:
Expand Down
61 changes: 42 additions & 19 deletions mchap/application/baseclass.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@
encode_read_distributions,
vcf,
)
from mchap.io.vcf.infofields import HEADER_INFO_FIELDS
from mchap.io.vcf.formatfields import HEADER_FORMAT_FIELDS
from mchap.io.vcf.infofields import HEADER_INFO_FIELDS, OPTIONAL_INFO_FIELDS
from mchap.io.vcf.formatfields import HEADER_FORMAT_FIELDS, OPTIONAL_FORMAT_FIELDS

import warnings

Expand Down Expand Up @@ -77,7 +77,11 @@ def info_fields(self):
"END",
"NVAR",
"SNVPOS",
] + [f for f in ["AFPRIOR", "AFP", "AOP"] if f in self.report_fields]
]
for f in OPTIONAL_INFO_FIELDS:
id = f.id
if (id in self.report_fields) or (f"INFO/{id}" in self.report_fields):
infofields.append(id)
return infofields

def format_fields(self):
Expand All @@ -93,9 +97,20 @@ def format_fields(self):
"GPM",
"PHPM",
"MCI",
] + [f for f in ["AFP", "AOP", "DS", "GP", "GL"] if f in self.report_fields]
]
for f in OPTIONAL_FORMAT_FIELDS:
id = f.id
if (id in self.report_fields) or (f"FORMAT/{id}" in self.report_fields):
formatfields.append(id)
return formatfields

def require_AFP(self):
requested = set(self.info_fields()) | set(self.format_fields())
if {"ACP", "AFP", "AOP", "AOPSUM"} & requested:
return True
else:
return False

def loci(self):
raise NotImplementedError()

Expand Down Expand Up @@ -162,6 +177,7 @@ def encode_sample_reads(self, data):
"DP",
"RCOUNT",
"RCALLS",
"SNVDP",
"read_calls",
"read_dists_unique",
"read_dist_counts",
Expand Down Expand Up @@ -200,12 +216,9 @@ def encode_sample_reads(self, data):
data.sampledata["RCOUNT"][sample] = read_count
read_variant_depth = character.depth(read_chars)
if len(read_variant_depth) == 0:
# no variants to score depth
data.sampledata["DP"][sample] = np.nan
else:
data.sampledata["DP"][sample] = np.round(
np.mean(read_variant_depth)
)
read_variant_depth = np.array(np.nan)
data.sampledata["DP"][sample] = np.round(np.mean(read_variant_depth))
data.sampledata["SNVDP"][sample] = np.round(read_variant_depth)

# encode reads as alleles and probabilities
read_calls = encode_read_alleles(locus, read_chars)
Expand Down Expand Up @@ -318,22 +331,32 @@ def sumarise_vcf_record(self, data):
data.infodata["DP"] = np.nansum(list(data.sampledata["DP"].values()))
# total read count
data.infodata["RCOUNT"] = np.nansum(list(data.sampledata["RCOUNT"].values()))
# population mean posterior allele frequencies
n_allele = len(data.columndata["ALTS"]) + 1
null_length_R = np.full(n_allele, np.nan)
if "ACP" in data.infofields:
_ACP = sum(data.sampledata["AFP"].values())
_ACP = null_length_R if np.isnan(_ACP).all() else _ACP
data.infodata["ACP"] = _ACP.round(self.precision)
if "AFP" in data.infofields:
# need to weight frequencies of each individual by ploidy
pop_ploidy = 0
pop_total = np.zeros(len(data.columndata["ALTS"]) + 1, float)
for sample, freqs in data.sampledata["AFP"].items():
ploidy = self.sample_ploidy[sample]
pop_ploidy += ploidy
pop_total += freqs * ploidy
data.infodata["AFP"] = (pop_total / pop_ploidy).round(self.precision)
# use ACP to weight frequencies of each individual by ploidy
_AFP = sum(data.sampledata["ACP"].values()) / sum(
data.sample_ploidy.values()
)
_AFP = null_length_R if np.isnan(_AFP).all() else _AFP
data.infodata["AFP"] = _AFP.round(self.precision)
if "AOPSUM" in data.infofields:
_AOPSUM = sum(data.sampledata["AOP"].values())
_AOPSUM = null_length_R if np.isnan(_AOPSUM).all() else _AOPSUM
data.infodata["AOPSUM"] = _AOPSUM.round(self.precision)
if "AOP" in data.infofields:
prob_not_occurring = np.ones(len(data.columndata["ALTS"]) + 1, float)
for occur in data.sampledata["AOP"].values():
prob_not_occurring = prob_not_occurring * (1 - occur)
prob_occurring = 1 - prob_not_occurring
data.infodata["AOP"] = prob_occurring.round(self.precision)
if "SNVDP" in data.infofields:
_SNVDP = sum(data.sampledata["SNVDP"].values())
data.infodata["SNVDP"] = _SNVDP.round(self.precision)
return data

def call_locus(self, locus, sample_bams):
Expand Down
19 changes: 7 additions & 12 deletions mchap/application/call.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ def call_sample_genotypes(self, data):
"MCI",
"GL",
"GP",
"ACP",
"AFP",
"AOP",
]:
Expand Down Expand Up @@ -129,6 +130,7 @@ def call_sample_genotypes(self, data):
data.sampledata["PHPM"][sample] = np.nan
data.sampledata["PHQ"][sample] = np.nan
data.sampledata["MCI"][sample] = np.nan
data.sampledata["ACP"][sample] = np.array([np.nan])
data.sampledata["AFP"][sample] = np.array([np.nan])
data.sampledata["AOP"][sample] = np.array([np.nan])
data.sampledata["GP"][sample] = np.array([np.nan])
Expand Down Expand Up @@ -178,22 +180,15 @@ def call_sample_genotypes(self, data):
data.sampledata["PHQ"][sample] = qual_of_prob(phenotype_prob)
data.sampledata["MCI"][sample] = incongruence

# posterior allele frequencies if requested
if "AFP" in data.formatfields:
frequencies = np.zeros(len(haplotypes))
alleles, counts = np.unique(trace.genotypes, return_counts=True)
frequencies[alleles] = counts / counts.sum()
# posterior allele frequencies/occurrence if requested
if self.require_AFP():
frequencies, counts, occurrence = trace.posterior_frequencies()
data.sampledata["ACP"][sample] = np.round(counts, self.precision)
data.sampledata["AFP"][sample] = np.round(
frequencies, self.precision
)

# posterior allele occurrence if requested
if "AOP" in data.formatfields:
occurrences = np.zeros(len(haplotypes))
alleles, _, occur = posterior.allele_frequencies()
occurrences[alleles] = occur
data.sampledata["AOP"][sample] = np.round(
occurrences, self.precision
occurrence, self.precision
)

# genotype posteriors if requested
Expand Down
28 changes: 17 additions & 11 deletions mchap/application/call_exact.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ def call_sample_genotypes(self, data):
"MCI",
"GL",
"GP",
"ACP",
"AFP",
"AOP",
]:
Expand Down Expand Up @@ -114,6 +115,7 @@ def call_sample_genotypes(self, data):
data.sampledata["PHPM"][sample] = np.nan
data.sampledata["PHQ"][sample] = np.nan
data.sampledata["MCI"][sample] = np.nan
data.sampledata["ACP"][sample] = np.array([np.nan])
data.sampledata["AFP"][sample] = np.array([np.nan])
data.sampledata["AOP"][sample] = np.array([np.nan])
data.sampledata["GP"][sample] = np.array([np.nan])
Expand Down Expand Up @@ -154,10 +156,13 @@ def call_sample_genotypes(self, data):
phenotype_prob = phenotype_probs.sum()

# store specified arrays
if ("AFP" in data.formatfields) or ("AOP" in data.formatfields):
freqs, occur = posterior_allele_frequencies(
if self.require_AFP():
freqs, counts, occur = posterior_allele_frequencies(
probabilities, ploidy, len(haplotypes)
)
data.sampledata["ACP"][sample] = np.round(
counts, self.precision
)
data.sampledata["AFP"][sample] = np.round(freqs, self.precision)
data.sampledata["AOP"][sample] = np.round(occur, self.precision)
if "GL" in data.formatfields:
Expand All @@ -179,17 +184,18 @@ def call_sample_genotypes(self, data):
inbreeding=inbreeding,
frequencies=prior_frequencies,
return_phenotype_prob=True,
return_posterior_frequencies="AFP" in data.formatfields,
return_posterior_occurrence="AOP" in data.formatfields,
return_posterior_frequencies=True,
return_posterior_occurrence=True,
)
alleles, _, genotype_prob, phenotype_prob = mode_results[0:4]
if "AOP" in data.formatfields:
occur = np.round(mode_results[-1], self.precision)
data.sampledata["AOP"][sample] = occur
mode_results = mode_results[0:-1]
if "AFP" in data.formatfields:
freqs = np.round(mode_results[-1], self.precision)
data.sampledata["AFP"][sample] = freqs

freqs = np.round(mode_results[-2], self.precision)
occur = np.round(mode_results[-1], self.precision)
data.sampledata["ACP"][sample] = np.round(
freqs * ploidy, self.precision
)
data.sampledata["AFP"][sample] = np.round(freqs, self.precision)
data.sampledata["AOP"][sample] = np.round(occur, self.precision)

# store variables
data.sampledata["alleles"][sample] = alleles
Expand Down
Loading

0 comments on commit f3fabf9

Please sign in to comment.