From 61b977ef65a126455f8c6d4e1a20b5de5b4aced9 Mon Sep 17 00:00:00 2001 From: Emily Fertig Date: Tue, 1 Oct 2024 18:25:38 +0000 Subject: [PATCH 1/5] Finish inference_gendb method. --- cxx/gendb.cc | 25 +++++++++++++++++++++++-- cxx/gendb_test.cc | 7 +++++++ 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/cxx/gendb.cc b/cxx/gendb.cc index 2cb4d05..08a7501 100644 --- a/cxx/gendb.cc +++ b/cxx/gendb.cc @@ -3,6 +3,7 @@ #include "gendb.hh" +#include #include #include #include @@ -136,9 +137,14 @@ void GenDB::sample_and_incorporate_reference( auto [ref_field, class_item] = ref_key; int new_val; if (new_rows_have_unique_entities) { - new_val = entity_crps[ref_class].max_table() + 1; + auto it = domain_crps[ref_class].tables.rbegin(); + if (it == domain_crps[ref_class].tables.rend()) { + new_val = 0; + } else { + new_val = it->first + 1; + } } else { - new_val = entity_crps[ref_class].sample(prng); + new_val = domain_crps[ref_class].sample(prng); } // Generate a unique ID for the sample and incorporate it into the @@ -236,6 +242,13 @@ T_items GenDB::sample_class_ancestors(std::mt19937* prng, T_items ref_items = sample_class_ancestors( prng, cv->class_name, reference_values.at(class_name).at(ref_key), new_rows_have_unique_entities); +======= + sample_and_incorporate_reference(prng, class_name, ref_key, + cv->class_name); + } + T_items ref_items = sample_class_ancestors( + prng, cv->class_name, reference_values.at(class_name).at(ref_key)); +>>>>>>> 9358ed2 (Finish inference_gendb method.) items.insert(items.end(), ref_items.begin(), ref_items.end()); } } @@ -491,6 +504,7 @@ double GenDB::unincorporate_from_domain_cluster_relation( unincorporated[{irm_code, ref_class, item}] = cluster_id; // Recursively check and unincorporate the entity's ancestors. +<<<<<<< HEAD if (relation_reference_indices.contains(r) && relation_reference_indices.at(r).contains(ind)) { for (auto [name, r_ind] : relation_reference_indices.at(r).at(ind)) { @@ -643,6 +657,11 @@ void GenDB::transition_reference(std::mt19937* prng, const std::string& ref_field, const int class_item) { // Get the Gibbs probabilities for the entity CRP of the reference value. + std::cerr << "a" << std::endl; + auto x = schema.classes.at(class_name); + std::cerr << "aa" << std::endl; + std::cerr << "ref field " << ref_field << std::endl; + // for (auto [a, b] : ) const std::string& ref_class = std::get(schema.classes.at(class_name).vars.at(ref_field).spec) .class_name; @@ -654,6 +673,7 @@ void GenDB::transition_reference(std::mt19937* prng, return; } + std::cerr << "just got gibbs probs" << std::endl; // For each relation, get the indices (in the items vector) of the reference // value being transitioned. std::map> domain_inds = @@ -708,6 +728,7 @@ void GenDB::transition_reference(std::mt19937* prng, unincorporated_from_entity_crps); } + std::cerr << "starting loop " << std::endl; // Loop over the candidate reference values and compute the logp of each. int i = 0; for (const auto& [table, n_customers] : crp_dist) { diff --git a/cxx/gendb_test.cc b/cxx/gendb_test.cc index 35259e1..b94bc24 100644 --- a/cxx/gendb_test.cc +++ b/cxx/gendb_test.cc @@ -1032,4 +1032,11 @@ observe gendb.transition_reference_class_and_ancestors(&prng, "Record"); } +BOOST_AUTO_TEST_CASE(test_transition_reference_class) { + std::mt19937 prng; + GenDB gendb(&prng, schema); + setup_gendb(&prng, gendb, 20); + gendb.transition_reference_class_and_ancestors(&prng, "Record"); +} + BOOST_AUTO_TEST_SUITE_END() From ca39ec77a24384b96bd53facab18517008fdab55 Mon Sep 17 00:00:00 2001 From: Emily Fertig Date: Wed, 2 Oct 2024 18:22:07 +0000 Subject: [PATCH 2/5] Fixes after rebasing. --- cxx/gendb.cc | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/cxx/gendb.cc b/cxx/gendb.cc index 08a7501..292dc83 100644 --- a/cxx/gendb.cc +++ b/cxx/gendb.cc @@ -3,7 +3,6 @@ #include "gendb.hh" -#include #include #include #include @@ -242,13 +241,6 @@ T_items GenDB::sample_class_ancestors(std::mt19937* prng, T_items ref_items = sample_class_ancestors( prng, cv->class_name, reference_values.at(class_name).at(ref_key), new_rows_have_unique_entities); -======= - sample_and_incorporate_reference(prng, class_name, ref_key, - cv->class_name); - } - T_items ref_items = sample_class_ancestors( - prng, cv->class_name, reference_values.at(class_name).at(ref_key)); ->>>>>>> 9358ed2 (Finish inference_gendb method.) items.insert(items.end(), ref_items.begin(), ref_items.end()); } } @@ -504,7 +496,6 @@ double GenDB::unincorporate_from_domain_cluster_relation( unincorporated[{irm_code, ref_class, item}] = cluster_id; // Recursively check and unincorporate the entity's ancestors. -<<<<<<< HEAD if (relation_reference_indices.contains(r) && relation_reference_indices.at(r).contains(ind)) { for (auto [name, r_ind] : relation_reference_indices.at(r).at(ind)) { @@ -657,11 +648,6 @@ void GenDB::transition_reference(std::mt19937* prng, const std::string& ref_field, const int class_item) { // Get the Gibbs probabilities for the entity CRP of the reference value. - std::cerr << "a" << std::endl; - auto x = schema.classes.at(class_name); - std::cerr << "aa" << std::endl; - std::cerr << "ref field " << ref_field << std::endl; - // for (auto [a, b] : ) const std::string& ref_class = std::get(schema.classes.at(class_name).vars.at(ref_field).spec) .class_name; @@ -728,7 +714,6 @@ void GenDB::transition_reference(std::mt19937* prng, unincorporated_from_entity_crps); } - std::cerr << "starting loop " << std::endl; // Loop over the candidate reference values and compute the logp of each. int i = 0; for (const auto& [table, n_customers] : crp_dist) { From 979be62630c76c947effa4d231d87cf2abb1c676 Mon Sep 17 00:00:00 2001 From: Emily Fertig Date: Tue, 1 Oct 2024 23:37:27 +0000 Subject: [PATCH 3/5] Handle new_entities_have_new_parts separately instead of plumbing it through methods that it isn't relevant to. --- cxx/gendb.cc | 97 ++++++++++++++++++++++++++++++++-------------------- cxx/gendb.hh | 11 +++--- 2 files changed, 67 insertions(+), 41 deletions(-) diff --git a/cxx/gendb.cc b/cxx/gendb.cc index 292dc83..4970528 100644 --- a/cxx/gendb.cc +++ b/cxx/gendb.cc @@ -53,9 +53,6 @@ void GenDB::incorporate( bool new_rows_have_unique_entities) { int id = row.first; - // TODO: Consider not walking the DAG when new_rows_have_unique_entities = - // True. - // Maps a query relation name to an observed value. std::map vals = row.second; @@ -64,15 +61,56 @@ void GenDB::incorporate( // Sample a set of items to be incorporated into the query relation. const std::vector& class_path = schema.query.fields.at(query_rel).class_path; - T_items items = sample_entities_relation( - prng, schema.query.record_class, class_path.cbegin(), class_path.cend(), - id, new_rows_have_unique_entities); + + T_items items; + if (new_rows_have_unique_entities) { + const std::vector& domains = std::visit( + [&](auto tr) { return tr.domains; }, hirm->schema.at(query_rel)); + items.resize(domains.size()); + get_unique_entities_relation(query_rel, items.size() - 1, id, items); + } else { + items = + sample_entities_relation(prng, schema.query.record_class, + class_path.cbegin(), class_path.cend(), id); + } // Incorporate the items/value into the query relation. incorporate_query_relation(prng, query_rel, items, val); } } +void GenDB::get_unique_entities_relation(const std::string& rel_name, + const int ind, const int class_item, + T_items& items) { + const std::vector& domains = std::visit( + [&](auto tr) { return tr.domains; }, hirm->schema.at(rel_name)); + items[ind] = class_item; + auto& ref_indices = relation_reference_indices; + if (ref_indices.contains(rel_name)) { + if (ref_indices.at(rel_name).contains(ind)) { + for (const auto& [rf_name, rf_ind] : ref_indices.at(rel_name).at(ind)) { + if (!reference_values.at(domains[ind]).contains({rf_name, class_item})) { + int new_val; + const std::string& ref_class = domains.at(rf_ind); + if (domain_crps.at(ref_class).tables.size() == 0) { + new_val = 0; + } else { + auto it = domain_crps.at(ref_class).tables.rbegin(); + new_val = it->first + 1; + } + int new_id = get_reference_id(domains[ind], rf_name, class_item); + + reference_values.at(domains[ind])[{rf_name, class_item}] = new_val; + domain_crps.at(ref_class).incorporate(new_id, new_val); + } + + int refval = reference_values.at(domains[ind]).at({rf_name, class_item}); + get_unique_entities_relation(rel_name, rf_ind, refval, items); + } + } + } +} + // This function walks the class_path of the query, populates the global // reference_values table if necessary, and returns a sampled set of items // for the query relation that corresponds to the class path. class_path_start @@ -80,15 +118,13 @@ void GenDB::incorporate( T_items GenDB::sample_entities_relation( std::mt19937* prng, const std::string& class_name, std::vector::const_iterator class_path_start, - std::vector::const_iterator class_path_end, int class_item, - bool new_rows_have_unique_entities) { + std::vector::const_iterator class_path_end, int class_item) { if (class_path_end - class_path_start == 1) { // The last item in class_path is the class from which the queried attribute // is observed (for which there's a corresponding clean relation, observing // the attribute from the class). We need to DFS-traverse the class's // parents, similar to PCleanSchemaHelper::compute_domains_for. - return sample_class_ancestors(prng, class_name, class_item, - new_rows_have_unique_entities); + return sample_class_ancestors(prng, class_name, class_item); } // These are noisy relation domains along the path from the latent cleanly- @@ -102,13 +138,12 @@ T_items GenDB::sample_entities_relation( .class_name; std::pair ref_key = {ref_field, class_item}; if (!reference_values.at(class_name).contains(ref_key)) { - sample_and_incorporate_reference(prng, class_name, ref_key, ref_class, - new_rows_have_unique_entities); + sample_and_incorporate_reference(prng, class_name, ref_key, ref_class); } - T_items items = sample_entities_relation( - prng, ref_class, ++class_path_start, class_path_end, - reference_values.at(class_name).at(ref_key), - new_rows_have_unique_entities); + T_items items = + sample_entities_relation( + prng, ref_class, ++class_path_start, class_path_end, + reference_values.at(class_name).at(ref_key)); // The order of the items corresponds to the order of the relation's domains, // with the class (domain) corresponding to the primary key placed last on the // list. @@ -131,20 +166,10 @@ int GenDB::get_reference_id(const std::string& class_name, // and stores the value in reference_values. void GenDB::sample_and_incorporate_reference( std::mt19937* prng, const std::string& class_name, - const std::pair& ref_key, const std::string& ref_class, - bool new_rows_have_unique_entities) { + const std::pair& ref_key, + const std::string& ref_class) { auto [ref_field, class_item] = ref_key; - int new_val; - if (new_rows_have_unique_entities) { - auto it = domain_crps[ref_class].tables.rbegin(); - if (it == domain_crps[ref_class].tables.rend()) { - new_val = 0; - } else { - new_val = it->first + 1; - } - } else { - new_val = domain_crps[ref_class].sample(prng); - } + int new_val = domain_crps[ref_class].sample(prng); // Generate a unique ID for the sample and incorporate it into the // entity CRP. @@ -221,8 +246,7 @@ void GenDB::sample_and_incorporate_for_class(std::mt19937* prng, // reference_values table/entity CRPs) if necessary. T_items GenDB::sample_class_ancestors(std::mt19937* prng, const std::string& class_name, - int class_item, - bool new_rows_have_unique_entities) { + int class_item) { T_items items; assert(schema.classes.contains(class_name)); PCleanClass c = schema.classes.at(class_name); @@ -234,13 +258,11 @@ T_items GenDB::sample_class_ancestors(std::mt19937* prng, std::pair ref_key = {name, class_item}; if (!reference_values.at(class_name).contains(ref_key)) { assert(prng != nullptr); - sample_and_incorporate_reference(prng, class_name, ref_key, - cv->class_name, - new_rows_have_unique_entities); + sample_and_incorporate_reference( + prng, class_name, ref_key, cv->class_name); } T_items ref_items = sample_class_ancestors( - prng, cv->class_name, reference_values.at(class_name).at(ref_key), - new_rows_have_unique_entities); + prng, cv->class_name, reference_values.at(class_name).at(ref_key)); items.insert(items.end(), ref_items.begin(), ref_items.end()); } } @@ -615,6 +637,7 @@ double GenDB::unincorporate_singleton( double logp_refclass = 0.; int ref_val = reference_values.at(class_name).at({ref_field, class_item}); + T_items base_items = sample_class_ancestors(prng, ref_class, ref_val); logp_refclass += unincorporate_from_entity_cluster(class_name, ref_field, class_item, unincorporated_from_entity_crps, false); @@ -733,7 +756,7 @@ void GenDB::transition_reference(std::mt19937* prng, // Sample and incorporate a new row into the ref_class table. Update // reference_values and entity_crps. T_items unused_base_items = - sample_class_ancestors(prng, ref_class, table, false); + sample_class_ancestors(prng, ref_class, table); // Sample and incorporate values into the relations corresponding to // the reference class. This may also incorporate new values into the IRM diff --git a/cxx/gendb.hh b/cxx/gendb.hh index 6bb0c93..09ae9b7 100644 --- a/cxx/gendb.hh +++ b/cxx/gendb.hh @@ -42,6 +42,11 @@ class GenDB { void sample_and_incorporate_reference( std::mt19937* prng, const std::string& class_name, + const std::pair& ref_key, + const std::string& ref_class); + + void get_unique_entities_relation(const std::string& rel_name, const int ind, + const int class_item, T_items& items); const std::pair& ref_key, const std::string& ref_class, bool new_rows_have_unique_entities); @@ -50,8 +55,7 @@ class GenDB { T_items sample_entities_relation( std::mt19937* prng, const std::string& class_name, std::vector::const_iterator class_path_start, - std::vector::const_iterator class_path_end, int class_item, - bool new_rows_have_unique_entities); + std::vector::const_iterator class_path_end, int class_item); // Samples and incorporates a value into all relations belonging to class_name // (including class attributes and noisy observations of ancestor class @@ -62,8 +66,7 @@ class GenDB { // Sample items from a class' ancestors (recursive reference fields). T_items sample_class_ancestors(std::mt19937* prng, - const std::string& class_name, int class_item, - bool new_rows_have_unique_entities); + const std::string& class_name, int class_item); // Populates "items" with entities by walking the DAG of reference indices, // starting with "ind". From a72aeacf5dff7b6a4bb5fd1288526c36dac2662e Mon Sep 17 00:00:00 2001 From: Emily Fertig Date: Thu, 3 Oct 2024 13:45:44 +0000 Subject: [PATCH 4/5] Update pclean_lib. --- cxx/pclean/pclean_lib.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cxx/pclean/pclean_lib.cc b/cxx/pclean/pclean_lib.cc index 9b1f23e..25dcbbe 100644 --- a/cxx/pclean/pclean_lib.cc +++ b/cxx/pclean/pclean_lib.cc @@ -63,7 +63,7 @@ void make_pclean_sample( T_items entities = gendb->sample_entities_relation( prng, gendb->schema.query.record_class, query_field.class_path.begin(), query_field.class_path.end(), - class_item, false); + class_item); (*query_values)[query_field.name] = gendb->hirm->sample_and_incorporate_relation( prng, query_field.name, entities); From 7aa4920a68d6cdade504e3ed30979900b2a3f34d Mon Sep 17 00:00:00 2001 From: Emily Fertig Date: Fri, 4 Oct 2024 19:33:40 +0000 Subject: [PATCH 5/5] Resolve merge conflicts. --- cxx/gendb.cc | 40 +++++++++++++++------------------------- cxx/gendb.hh | 2 -- cxx/gendb_test.cc | 7 ------- 3 files changed, 15 insertions(+), 34 deletions(-) diff --git a/cxx/gendb.cc b/cxx/gendb.cc index 4970528..8bfc2e0 100644 --- a/cxx/gendb.cc +++ b/cxx/gendb.cc @@ -89,22 +89,16 @@ void GenDB::get_unique_entities_relation(const std::string& rel_name, if (ref_indices.contains(rel_name)) { if (ref_indices.at(rel_name).contains(ind)) { for (const auto& [rf_name, rf_ind] : ref_indices.at(rel_name).at(ind)) { - if (!reference_values.at(domains[ind]).contains({rf_name, class_item})) { - int new_val; + if (!reference_values.at(domains[ind]) + .contains({rf_name, class_item})) { const std::string& ref_class = domains.at(rf_ind); - if (domain_crps.at(ref_class).tables.size() == 0) { - new_val = 0; - } else { - auto it = domain_crps.at(ref_class).tables.rbegin(); - new_val = it->first + 1; - } + int new_val = entity_crps.at(ref_class).max_table() + 1; int new_id = get_reference_id(domains[ind], rf_name, class_item); - reference_values.at(domains[ind])[{rf_name, class_item}] = new_val; - domain_crps.at(ref_class).incorporate(new_id, new_val); + entity_crps.at(ref_class).incorporate(new_id, new_val); } - - int refval = reference_values.at(domains[ind]).at({rf_name, class_item}); + int refval = + reference_values.at(domains[ind]).at({rf_name, class_item}); get_unique_entities_relation(rel_name, rf_ind, refval, items); } } @@ -140,10 +134,9 @@ T_items GenDB::sample_entities_relation( if (!reference_values.at(class_name).contains(ref_key)) { sample_and_incorporate_reference(prng, class_name, ref_key, ref_class); } - T_items items = - sample_entities_relation( - prng, ref_class, ++class_path_start, class_path_end, - reference_values.at(class_name).at(ref_key)); + T_items items = sample_entities_relation( + prng, ref_class, ++class_path_start, class_path_end, + reference_values.at(class_name).at(ref_key)); // The order of the items corresponds to the order of the relation's domains, // with the class (domain) corresponding to the primary key placed last on the // list. @@ -166,16 +159,15 @@ int GenDB::get_reference_id(const std::string& class_name, // and stores the value in reference_values. void GenDB::sample_and_incorporate_reference( std::mt19937* prng, const std::string& class_name, - const std::pair& ref_key, - const std::string& ref_class) { + const std::pair& ref_key, const std::string& ref_class) { auto [ref_field, class_item] = ref_key; - int new_val = domain_crps[ref_class].sample(prng); + int new_val = entity_crps.at(ref_class).sample(prng); // Generate a unique ID for the sample and incorporate it into the // entity CRP. int new_id = get_reference_id(class_name, ref_field, class_item); reference_values.at(class_name)[ref_key] = new_val; - entity_crps[ref_class].incorporate(new_id, new_val); + entity_crps.at(ref_class).incorporate(new_id, new_val); } // Incorporates an observed value into a query relation. Recursively @@ -210,7 +202,7 @@ void GenDB::sample_and_incorporate_for_class(std::mt19937* prng, const std::string& class_name, const T_item& item) { for (const std::string& rel_name : class_to_relations.at(class_name)) { - sample_class_ancestors(prng, class_name, item, false); + sample_class_ancestors(prng, class_name, item); const std::vector& domains = std::visit( [&](auto tr) { return tr.domains; }, hirm->schema.at(rel_name)); T_items rel_items(domains.size()); @@ -258,8 +250,8 @@ T_items GenDB::sample_class_ancestors(std::mt19937* prng, std::pair ref_key = {name, class_item}; if (!reference_values.at(class_name).contains(ref_key)) { assert(prng != nullptr); - sample_and_incorporate_reference( - prng, class_name, ref_key, cv->class_name); + sample_and_incorporate_reference(prng, class_name, ref_key, + cv->class_name); } T_items ref_items = sample_class_ancestors( prng, cv->class_name, reference_values.at(class_name).at(ref_key)); @@ -637,7 +629,6 @@ double GenDB::unincorporate_singleton( double logp_refclass = 0.; int ref_val = reference_values.at(class_name).at({ref_field, class_item}); - T_items base_items = sample_class_ancestors(prng, ref_class, ref_val); logp_refclass += unincorporate_from_entity_cluster(class_name, ref_field, class_item, unincorporated_from_entity_crps, false); @@ -682,7 +673,6 @@ void GenDB::transition_reference(std::mt19937* prng, return; } - std::cerr << "just got gibbs probs" << std::endl; // For each relation, get the indices (in the items vector) of the reference // value being transitioned. std::map> domain_inds = diff --git a/cxx/gendb.hh b/cxx/gendb.hh index 09ae9b7..9485974 100644 --- a/cxx/gendb.hh +++ b/cxx/gendb.hh @@ -47,8 +47,6 @@ class GenDB { void get_unique_entities_relation(const std::string& rel_name, const int ind, const int class_item, T_items& items); - const std::pair& ref_key, const std::string& ref_class, - bool new_rows_have_unique_entities); // Samples a set of entities in the domains of the relation corresponding to // class_path. diff --git a/cxx/gendb_test.cc b/cxx/gendb_test.cc index b94bc24..35259e1 100644 --- a/cxx/gendb_test.cc +++ b/cxx/gendb_test.cc @@ -1032,11 +1032,4 @@ observe gendb.transition_reference_class_and_ancestors(&prng, "Record"); } -BOOST_AUTO_TEST_CASE(test_transition_reference_class) { - std::mt19937 prng; - GenDB gendb(&prng, schema); - setup_gendb(&prng, gendb, 20); - gendb.transition_reference_class_and_ancestors(&prng, "Record"); -} - BOOST_AUTO_TEST_SUITE_END()