Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor low-level code #830

Draft
wants to merge 42 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
7ef65c7
Initial rough work on getting ancstor_matcher decoupled
jeromekelleher Apr 13, 2023
540265a
Initial dump of required HMM code
jeromekelleher Apr 25, 2023
6f066cc
First steps - isolated code works with shim Builder class
jeromekelleher Apr 25, 2023
936c338
Some more tests
jeromekelleher Apr 26, 2023
2e2b192
Remove TreeSequenceBuilder
jeromekelleher Apr 26, 2023
32d580d
Fake the sortedcontainers API for now
jeromekelleher Apr 26, 2023
230bffe
Fully move to sorted lists of edges.
jeromekelleher Apr 26, 2023
d49d4a3
Stuff
jeromekelleher Apr 26, 2023
9ef666b
Partial
jeromekelleher Apr 27, 2023
3394a81
Abstract out the MatcherIndexes class
jeromekelleher Apr 27, 2023
3c6f5c0
Pull out stuff necessary for static match
jeromekelleher Apr 27, 2023
b81f408
Compiling version without tree_sequence_builder
jeromekelleher Apr 27, 2023
17b1d0f
Partway through getting C tests running
jeromekelleher Apr 28, 2023
2c59357
add test dataq
jeromekelleher Apr 28, 2023
5869f0a
C code working and some tests
jeromekelleher Apr 30, 2023
0297f36
Factor out the "output" struct in matcher
jeromekelleher May 1, 2023
698cc19
Some basics for new Matcher infrastructure
jeromekelleher May 2, 2023
5f3e7d9
compiling with lwt interface
jeromekelleher May 2, 2023
6c7748d
partial update to use table collection
jeromekelleher May 2, 2023
211aae0
Convert C code to use tables
jeromekelleher May 2, 2023
5d77002
Roughly working Python-C infrastructure
jeromekelleher May 2, 2023
5b4b02e
Add basic debug support to the MatcherIndexes
jeromekelleher May 3, 2023
b0546f6
Basic Python-C linkage works :hooray:
jeromekelleher May 3, 2023
abd8dd0
Basic high-level infrastructure for matching
jeromekelleher May 3, 2023
52d20e6
Rename file
jeromekelleher May 4, 2023
ea9bcd3
Refactor the Matcher infrastructure
jeromekelleher May 4, 2023
864d227
Improve class infrastructure
jeromekelleher May 4, 2023
6aa12ef
Add vestigial root automatically
jeromekelleher May 4, 2023
62c534c
Fix up tests to remove hard-coded virtual root
jeromekelleher May 5, 2023
5c86a84
Work on making matcher work with edges not on site values
jeromekelleher May 5, 2023
5ca0710
Roughtly working version with edges on genome coords
jeromekelleher May 6, 2023
9a7492b
Python version looks like it's working
jeromekelleher May 6, 2023
91e50fe
Add sites_position storage and coordinate_t type
jeromekelleher May 6, 2023
b8aed5b
Matching working in C (looks like)
jeromekelleher May 8, 2023
2f694bb
Infer start and end from haplotype
jeromekelleher May 10, 2023
1530fb2
Rough implementation of flank skipping
jeromekelleher May 10, 2023
7a905e7
Change python code to use coords in path
jeromekelleher May 12, 2023
854f13b
Implement coordinate paths in C
jeromekelleher May 12, 2023
0e59ceb
Implement some cludges to support initial zero site-paht
jeromekelleher May 13, 2023
ef34c12
Minor updates
jeromekelleher May 15, 2023
3626a2a
Sort-of working driver script
jeromekelleher May 15, 2023
f907061
Fiddle with some tricky issues
jeromekelleher May 16, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
461 changes: 461 additions & 0 deletions _tsinfermodule.c

Large diffs are not rendered by default.

1,096 changes: 1,095 additions & 1 deletion lib/ancestor_matcher.c

Large diffs are not rendered by default.

Binary file added lib/test_data/multi_tree_example.trees
Binary file not shown.
Binary file added lib/test_data/single_tree_example.trees
Binary file not shown.
166 changes: 165 additions & 1 deletion lib/tests/tests.c
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,39 @@

#include <CUnit/Basic.h>

/* FIXME this needs to be updated somehow to allow the tests to be run from
* different directories, i.e., with ninja -C build test
*/
#define TEST_DATA_DIR "test_data"

/* Global variables used for test in state in the test suite */

char *_tmp_file_name;
FILE *_devnull;

tsk_treeseq_t _single_tree_ex_ts;
/* 3.00┊ 0 ┊ */
/* ┊ ┃ ┊ */
/* 2.00┊ 7 ┊ */
/* ┊ ┏━┻━┓ ┊ */
/* 1.00┊ 5 6 ┊ */
/* ┊ ┏┻┓ ┏┻┓ ┊ */
/* 0.00┊ 1 2 3 4 ┊ */
/* 0 4 */
tsk_treeseq_t _multi_tree_ex_ts;
/* 1.84┊ 0 ┊ 0 ┊ */
/* ┊ ┃ ┊ ┃ ┊ */
/* 0.84┊ 8 ┊ 8 ┊ */
/* ┊ ┏━┻━┓ ┊ ┏━┻━┓ ┊ */
/* 0.42┊ ┃ ┃ ┊ 7 ┃ ┊ */
/* ┊ ┃ ┃ ┊ ┏┻┓ ┃ ┊ */
/* 0.05┊ 6 ┃ ┊ ┃ ┃ ┃ ┊ */
/* ┊ ┏━┻┓ ┃ ┊ ┃ ┃ ┃ ┊ */
/* 0.04┊ ┃ 5 ┃ ┊ ┃ ┃ 5 ┊ */
/* ┊ ┃ ┏┻┓ ┃ ┊ ┃ ┃ ┏┻┓ ┊ */
/* 0.00┊ 1 2 3 4 ┊ 1 4 2 3 ┊ */
/* 0 2 4 */

static void
dump_tree_sequence_builder(
tree_sequence_builder_t *tsb, tsk_table_collection_t *tables, tsk_flags_t options)
Expand Down Expand Up @@ -985,6 +1013,117 @@ test_packbits_errors(void)
CU_ASSERT_EQUAL_FATAL(ret, TSI_ERR_ONE_BIT_NON_BINARY);
}

static int
run_match(const tsk_treeseq_t *ts, double rho, double mu, const allele_t *h,
allele_t *match, tsk_size_t *path_length, tsk_id_t *left, tsk_id_t *right,
tsk_id_t *parent)
{
int ret;
ancestor_matcher2_t am;
matcher_indexes_t mi;
const size_t m = tsk_treeseq_get_num_sites(ts);
double *recombination_rate = calloc(m, sizeof(*recombination_rate));
double *mutation_rate = calloc(m, sizeof(*mutation_rate));
size_t j;

CU_ASSERT_FATAL(recombination_rate != NULL);
CU_ASSERT_FATAL(mutation_rate != NULL);
for (j = 0; j < m; j++) {
mutation_rate[j] = mu;
recombination_rate[j] = rho;
}

ret = matcher_indexes_alloc(&mi, ts->tables, 0);
CU_ASSERT_EQUAL_FATAL(ret, 0);
/* matcher_indexes_print_state(&mi, stdout); */
ret = ancestor_matcher2_alloc(&am, &mi, recombination_rate, mutation_rate, 14, 0);
CU_ASSERT_EQUAL_FATAL(ret, 0);

ret = ancestor_matcher2_find_path(
&am, 0, (tsk_id_t) m, h, match, path_length, left, right, parent);
CU_ASSERT_EQUAL_FATAL(ret, 0);
/* ancestor_matcher2_print_state(&am, stdout); */

ancestor_matcher2_free(&am);
matcher_indexes_free(&mi);
free(recombination_rate);
free(mutation_rate);

return 0;
}

static void
check_matching_single_site_match(const tsk_treeseq_t *ts)
{
allele_t h[] = { 0, 0, 0, 0 };
allele_t match[4];
tsk_id_t j, left[4], right[4], parent[4];
tsk_size_t path_length;

CU_ASSERT_EQUAL_FATAL(tsk_treeseq_get_num_sites(ts), 4);

for (j = 0; j < 4; j++) {
memset(h, 0, sizeof(h));
h[j] = 1;
run_match(ts, 1e-8, 0, h, match, &path_length, left, right, parent);
CU_ASSERT_EQUAL_FATAL(path_length, 1);
CU_ASSERT_EQUAL_FATAL(left[0], 0);
CU_ASSERT_EQUAL_FATAL(right[0], 4);
CU_ASSERT_EQUAL_FATAL(parent[0], j + 1);
}
}

static void
test_matching_single_tree_single_site_match(void)
{
check_matching_single_site_match(&_single_tree_ex_ts);
}

static void
test_matching_multi_tree_single_site_match(void)
{
check_matching_single_site_match(&_multi_tree_ex_ts);
}

static void
check_matching_multi_switch(const tsk_treeseq_t *ts)
{
allele_t h[] = { 1, 1, 1, 1 };
allele_t match[4];
tsk_id_t left[4], right[4], parent[4];
tsk_size_t path_length;

CU_ASSERT_EQUAL_FATAL(tsk_treeseq_get_num_sites(ts), 4);
CU_ASSERT_EQUAL_FATAL(tsk_treeseq_get_sequence_length(ts), 4);

run_match(ts, 1e-8, 0, h, match, &path_length, left, right, parent);
CU_ASSERT_EQUAL_FATAL(path_length, 4);
CU_ASSERT_EQUAL_FATAL(left[3], 0);
CU_ASSERT_EQUAL_FATAL(right[3], 1);
CU_ASSERT_EQUAL_FATAL(parent[3], 1);
CU_ASSERT_EQUAL_FATAL(left[2], 1);
CU_ASSERT_EQUAL_FATAL(right[2], 2);
CU_ASSERT_EQUAL_FATAL(parent[2], 2);
CU_ASSERT_EQUAL_FATAL(left[1], 2);
CU_ASSERT_EQUAL_FATAL(right[1], 3);
CU_ASSERT_EQUAL_FATAL(parent[1], 3);
CU_ASSERT_EQUAL_FATAL(left[0], 3);
CU_ASSERT_EQUAL_FATAL(right[0], 4);
CU_ASSERT_EQUAL_FATAL(parent[0], 4);
}

static void
test_matching_single_tree_multi_switch(void)
{
check_matching_multi_switch(&_single_tree_ex_ts);
}

static void
test_matching_multi_tree_multi_switch(void)
{
check_matching_multi_switch(&_multi_tree_ex_ts);
}

static void
test_strerror(void)
{
Expand All @@ -1004,11 +1143,13 @@ test_strerror(void)
static int
tsinfer_suite_init(void)
{
int fd;
int ret, fd;
static char template[] = "/tmp/tsi_c_test_XXXXXX";

_tmp_file_name = NULL;
_devnull = NULL;
memset(&_single_tree_ex_ts, 0, sizeof(_single_tree_ex_ts));
memset(&_multi_tree_ex_ts, 0, sizeof(_multi_tree_ex_ts));

_tmp_file_name = malloc(sizeof(template));
if (_tmp_file_name == NULL) {
Expand All @@ -1024,6 +1165,18 @@ tsinfer_suite_init(void)
if (_devnull == NULL) {
return CUE_SINIT_FAILED;
}

ret = tsk_treeseq_load(
&_single_tree_ex_ts, TEST_DATA_DIR "/single_tree_example.trees", 0);
if (ret != 0) {
return CUE_SINIT_FAILED;
}
ret = tsk_treeseq_load(
&_multi_tree_ex_ts, TEST_DATA_DIR "/multi_tree_example.trees", 0);
if (ret != 0) {
return CUE_SINIT_FAILED;
}

return CUE_SUCCESS;
}

Expand All @@ -1037,6 +1190,8 @@ tsinfer_suite_cleanup(void)
if (_devnull != NULL) {
fclose(_devnull);
}
tsk_treeseq_free(&_single_tree_ex_ts);
tsk_treeseq_free(&_multi_tree_ex_ts);
return CUE_SUCCESS;
}

Expand Down Expand Up @@ -1077,6 +1232,15 @@ main(int argc, char **argv)
{ "test_packbits_4", test_packbits_4 },
{ "test_packbits_errors", test_packbits_errors },

{ "test_matching_single_tree_single_site_match",
test_matching_single_tree_single_site_match },
{ "test_matching_multi_tree_single_site_match",
test_matching_multi_tree_single_site_match },
{ "test_matching_single_tree_multi_switch",
test_matching_single_tree_multi_switch },
{ "test_matching_multi_tree_multi_switch",
test_matching_multi_tree_multi_switch },

{ "test_strerror", test_strerror },

CU_TEST_INFO_NULL,
Expand Down
72 changes: 70 additions & 2 deletions lib/tsinfer.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,10 +46,12 @@
#define TSI_NODE_IS_PC_ANCESTOR ((tsk_flags_t)(1u << 16))

typedef int8_t allele_t;
/* TODO should probably change to uint32 when we have removed the old code.*/
typedef tsk_id_t coordinate_t;

typedef struct {
tsk_id_t left;
tsk_id_t right;
coordinate_t left;
coordinate_t right;
tsk_id_t parent;
tsk_id_t child;
} edge_t;
Expand Down Expand Up @@ -209,6 +211,54 @@ typedef struct {
} output;
} ancestor_matcher_t;

typedef struct {
tsk_flags_t flags;
size_t num_sites;
size_t num_nodes;
size_t num_mutations;
size_t num_edges;
struct {
coordinate_t *position;
mutation_list_node_t **mutations;
tsk_size_t *num_alleles;
} sites;
edge_t *left_index_edges;
edge_t *right_index_edges;
tsk_blkalloc_t allocator;
} matcher_indexes_t;

typedef struct {
int flags;
const matcher_indexes_t *matcher_indexes;
size_t num_nodes;
size_t num_sites;
/* Input LS model rates */
unsigned int precision;
double *recombination_rate;
double *mismatch_rate;
/* The quintuply linked tree */
tsk_id_t *parent;
tsk_id_t *left_child;
tsk_id_t *right_child;
tsk_id_t *left_sib;
tsk_id_t *right_sib;
double *likelihood;
double *likelihood_cache;
allele_t *allelic_state;
int num_likelihood_nodes;
/* At each site, record a node with the maximum likelihood. */
tsk_id_t *max_likelihood_node;
/* Used during traceback to map nodes where recombination is required. */
int8_t *recombination_required;
tsk_id_t *likelihood_nodes_tmp;
tsk_id_t *likelihood_nodes;
node_state_list_t *traceback;
tsk_blkalloc_t traceback_allocator;
size_t total_traceback_size;
size_t traceback_block_size;
size_t traceback_realloc_size;
} ancestor_matcher2_t;

int ancestor_builder_alloc(ancestor_builder_t *self, size_t num_samples,
size_t num_sites, int mmap_fd, int flags);
int ancestor_builder_free(ancestor_builder_t *self);
Expand Down Expand Up @@ -267,6 +317,24 @@ int tree_sequence_builder_dump_edges(tree_sequence_builder_t *self, tsk_id_t *le
int tree_sequence_builder_dump_mutations(tree_sequence_builder_t *self, tsk_id_t *site,
tsk_id_t *node, allele_t *derived_state, tsk_id_t *parent);

/* New impelementation */

int matcher_indexes_alloc(
matcher_indexes_t *self, const tsk_table_collection_t *tables, tsk_flags_t options);
int matcher_indexes_print_state(const matcher_indexes_t *self, FILE *out);
int matcher_indexes_free(matcher_indexes_t *self);

int ancestor_matcher2_alloc(ancestor_matcher2_t *self,
const matcher_indexes_t *matcher_indexes, double *recombination_rate,
double *mismatch_rate, unsigned int precision, int flags);
int ancestor_matcher2_free(ancestor_matcher2_t *self);
int ancestor_matcher2_find_path(ancestor_matcher2_t *self, tsk_id_t start, tsk_id_t end,
const allele_t *haplotype, allele_t *matched_haplotype, size_t *path_length,
tsk_id_t *path_left, tsk_id_t *path_right, tsk_id_t *path_parent);
int ancestor_matcher2_print_state(ancestor_matcher2_t *self, FILE *out);
double ancestor_matcher2_get_mean_traceback_size(ancestor_matcher2_t *self);
size_t ancestor_matcher2_get_total_memory(ancestor_matcher2_t *self);

int packbits(const allele_t *restrict source, size_t len, uint8_t *restrict dest);
void unpackbits(const uint8_t *restrict source, size_t len, allele_t *restrict dest);

Expand Down
1 change: 1 addition & 0 deletions lwt_interface
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
tskroot = os.path.join(libdir, "subprojects", "tskit")
tskdir = os.path.join(tskroot, "tskit")
kasdir = os.path.join(tskroot, "subprojects", "kastore")
includes = [libdir, tskroot, tskdir, kasdir]
includes = ["lwt_interface", libdir, tskroot, tskdir, kasdir]

tsi_source_files = [
"ancestor_matcher.c",
Expand All @@ -24,7 +24,7 @@
]
# We're not actually using very much of tskit at the moment, so
# just build the stuff we need.
tsk_source_files = ["core.c"]
tsk_source_files = ["core.c", "tables.c", "trees.c"]
kas_source_files = ["kastore.c"]

sources = (
Expand Down
29 changes: 29 additions & 0 deletions tests/test_low_level.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,10 @@
Integrity tests for the low-level module.
"""
import sys
import tempfile

import pytest
import tskit

import _tsinfer

Expand Down Expand Up @@ -151,3 +153,30 @@ def test_add_too_many_sites(self):
assert str(record.value) == msg

# TODO need tester methods for the remaining methonds in the class.


class TestMatcherIndexes:
def test_single_tree(self):
ts = tskit.Tree.generate_balanced(4).tree_sequence
tables = ts.dump_tables()
ll_tables = _tsinfer.LightweightTableCollection(tables.sequence_length)
ll_tables.fromdict(tables.asdict())
mi = _tsinfer.MatcherIndexes(ll_tables)
print(mi)
mi.print_state(sys.stdout)

def test_print_state(self):
ts = tskit.Tree.generate_balanced(4).tree_sequence
tables = ts.dump_tables()
ll_tables = _tsinfer.LightweightTableCollection(tables.sequence_length)
ll_tables.fromdict(tables.asdict())
mi = _tsinfer.MatcherIndexes(ll_tables)
with pytest.raises(TypeError):
mi.print_state()

with tempfile.TemporaryFile("w+") as f:
mi.print_state(f)
f.seek(0)
output = f.read()
assert len(output) > 0
assert "indexes" in output
Loading