Skip to content

Commit

Permalink
Merge pull request jeromekelleher#321 from szhan/ignore_excluded_site…
Browse files Browse the repository at this point in the history
…s_base_composition

Disregard excluded sites when computing base composition
  • Loading branch information
jeromekelleher authored Oct 1, 2024
2 parents 45a335a + 9f85a0c commit 3fa1e58
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 3 deletions.
22 changes: 19 additions & 3 deletions sc2ts/alignments.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,18 @@ def decode_alignment(a):
return alleles[a]


def base_composition(haplotype):
    """
    Return a collections.Counter tallying every element of ``haplotype``.
    """
    return collections.Counter(haplotype)
def base_composition(haplotype, excluded_sites=None):
    """
    Tally the symbols of a haplotype, optionally skipping excluded sites.

    The first element of ``haplotype`` is an arbitrary placeholder and is
    never counted. Positions in ``excluded_sites`` are 1-based, so with the
    placeholder in place they index ``haplotype`` directly.

    Returns a collections.Counter keyed by the symbols that were counted.
    """
    if excluded_sites is None:
        return collections.Counter(haplotype[1:])

    # Start from "keep everything" and switch off the excluded positions.
    keep = np.ones(len(haplotype), dtype=bool)
    keep[excluded_sites] = False
    # Drop the placeholder from both the haplotype and the keep-mask.
    retained = haplotype[1:][keep[1:]]
    return collections.Counter(retained)


def compress_alignment(a):
Expand Down Expand Up @@ -145,12 +155,14 @@ class MaskedAlignment:
# Integer array of masked site positions; encode_and_mask builds it with
# np.array(masked_sites, dtype=int) and qc_summary reports its length.
masked_sites: np.ndarray
# Symbol counts returned by base_composition for the full alignment.
original_base_composition: dict
# Hex MD5 digest of the alignment with its first element dropped.
original_md5: str
# Symbol counts returned by base_composition with masked_sites excluded.
# This is a Counter (a dict subclass), not a string.
masked_base_composition: dict

def qc_summary(self):
    """
    Return the quality-control fields of this alignment as a new dict.

    The result holds the number of masked sites (length of the first axis
    of ``masked_sites``) followed by the stored original base composition,
    original MD5 digest and masked base composition.
    """
    summary = {"num_masked_sites": self.masked_sites.shape[0]}
    # The remaining entries are copied verbatim from same-named attributes.
    for field in (
        "original_base_composition",
        "original_md5",
        "masked_base_composition",
    ):
        summary[field] = getattr(self, field)
    return summary


Expand All @@ -161,6 +173,10 @@ def encode_and_mask(alignment, window_size=7):
return MaskedAlignment(
alignment=a,
masked_sites=np.array(masked_sites, dtype=int),
original_base_composition=base_composition(alignment[1:]),
original_base_composition=base_composition(haplotype=alignment),
original_md5=hashlib.md5(alignment[1:]).hexdigest(),
masked_base_composition=base_composition(
haplotype=alignment,
excluded_sites=masked_sites,
),
)
6 changes: 6 additions & 0 deletions tests/test_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -311,6 +311,12 @@ def test_first_day(self, tmp_path, fx_ts_map, fx_alignment_store, fx_metadata_db
"T": 9566,
},
"original_md5": "e96feaa72c4f4baba73c2e147ede7502",
"masked_base_composition": {
'A': 8891,
'C': 5468,
'G': 5849,
'T': 9562,
},
}

ts.tables.assert_equals(fx_ts_map["2020-01-19"].tables, ignore_provenance=True)
Expand Down

0 comments on commit 3fa1e58

Please sign in to comment.