probcomp · ThomasColthurst · Sep 11, 2024 · Sep 10, 2024 · Sep 10, 2024
diff --git a/cxx/pclean/pclean_lib.cc b/cxx/pclean/pclean_lib.cc
@@ -24,6 +24,7 @@ T_observations translate_observations(
     const T_relation& trel = schema.at(col_name);
     size_t num_domains = std::visit([&](const auto &r) {
       return r.domains.size();}, trel);
+    assert(num_domains == annotated_domains_for_relations.at(col_name).size());
 
     for (size_t i = 0; i < col.second.size(); ++i) {
       const std::string& val = col.second[i];

diff --git a/cxx/pclean/pclean_lib_test.cc b/cxx/pclean/pclean_lib_test.cc
@@ -32,8 +32,8 @@ BOOST_AUTO_TEST_CASE(test_translate_observations) {
       T_noisy_relation{{"dCounty", "dObs"}, true, EmissionSpec("bigram"), "County:state"}}};
 
   std::map<std::string, std::vector<std::string>> annotated_domains_for_relations;
-  annotated_domains_for_relations["Room Type"] = {"county:County", "Obs"};
-  annotated_domains_for_relations["Monthly Rent"] = {"county:County", "Obs"};
+  annotated_domains_for_relations["Room Type"] = {"Obs"};
+  annotated_domains_for_relations["Monthly Rent"] = {"Obs"};
   annotated_domains_for_relations["County"] = {"county:County", "Obs"};
   annotated_domains_for_relations["State"] = {"county:County", "Obs"};
 

diff --git a/cxx/pclean/schema_helper.cc b/cxx/pclean/schema_helper.cc
@@ -47,15 +47,6 @@ void PCleanSchemaHelper::compute_domains_for(const std::string& name) {
   annotated_domains[name] = annotated_ds;
 }
 
-std::string make_prefix_path(
-    std::vector<std::string>& var_names, size_t index) {
-  std::string s;
-  for (size_t i = index; i < var_names.size(); ++i) {
-    s += var_names[i] + ":";
-  }
-  return s;
-}
-
 void PCleanSchemaHelper::make_relations_for_queryfield(
     const QueryField& f, const PCleanClass& record_class, T_schema* tschema,
     std::map<std::string, std::vector<std::string>>
@@ -108,30 +99,34 @@ void PCleanSchemaHelper::make_relations_for_queryfield(
   // Handle only_final_emissions == true.
   if (only_final_emissions) {
     std::vector<std::string> noisy_domains = domains[class_names.back()];
+    std::vector<std::string> adfr = annotated_domains[class_names.back()];
     for (int i = class_names.size() - 2; i >= 0; --i) {
       noisy_domains.push_back(class_names[i]);
+      for (size_t j = 0; j < adfr.size(); ++j) {
+        adfr[j] = var_names[i] + ":" + adfr[j];
+      }
+      adfr.push_back(class_names[i]);
     }
     T_noisy_relation tnr = get_emission_relation(
         std::get<ScalarVar>(last_var.spec),
         noisy_domains,
         base_relation_name);
     tnr.is_observed = true;
     (*tschema)[f.name] = tnr;
-    std::string path_prefix = make_prefix_path(var_names, 0);
-    std::vector<std::string> reordered_annotated_domains = reorder_domains(
-          annotated_domains[record_class.name],
-          annotated_domains[record_class.name],
-          path_prefix);
-    (*annotated_domains_for_relation)[f.name] = reordered_annotated_domains;
+    (*annotated_domains_for_relation)[f.name] = adfr;
     return;
   }
 
   // Handle only_final_emissions == false.
   std::string& previous_relation = base_relation_name;
   std::vector<std::string> current_domains = domains[class_names.back()];
+  std::vector<std::string> adfr = annotated_domains[class_names.back()];
   for (int i = f.class_path.size() - 2; i >= 0; --i) {
-    std::string path_prefix = make_prefix_path(var_names, i);
     current_domains.push_back(class_names[i]);
+    for (size_t j = 0; j < adfr.size(); ++j) {
+      adfr[j] = var_names[i] + ":" + adfr[j];
+    }
+    adfr.push_back(class_names[i]);
     T_noisy_relation tnr = get_emission_relation(
         std::get<ScalarVar>(last_var.spec),
         current_domains,
@@ -148,30 +143,8 @@ void PCleanSchemaHelper::make_relations_for_queryfield(
     }
     (*tschema)[rel_name] = tnr;
     previous_relation = rel_name;
-    std::vector<std::string> reordered_annotated_domains = reorder_domains(
-          annotated_domains[class_names[i]],
-          annotated_domains[class_names[i]],
-          path_prefix);
-    (*annotated_domains_for_relation)[rel_name] = reordered_annotated_domains;
-  }
-}
-
-std::vector<std::string> reorder_domains(
-    const std::vector<std::string>& original_domains,
-    const std::vector<std::string>& annotated_ds,
-    const std::string& prefix) {
-  std::vector<std::string> output_domains;
-  for (size_t i = 0; i < original_domains.size(); ++i) {
-    if (annotated_ds[i].starts_with(prefix)) {
-      output_domains.push_back(original_domains[i]);
-    }
-  }
-  for (size_t i = 0; i < original_domains.size(); ++i) {
-    if (!annotated_ds[i].starts_with(prefix)) {
-      output_domains.push_back(original_domains[i]);
-    }
+    (*annotated_domains_for_relation)[rel_name] = adfr;
   }
-  return output_domains;
 }
 
 T_schema PCleanSchemaHelper::make_hirm_schema(

diff --git a/cxx/pclean/schema_helper.hh b/cxx/pclean/schema_helper.hh
@@ -44,10 +44,3 @@ class PCleanSchemaHelper {
   std::map<std::string, std::vector<std::string>> domains;
   std::map<std::string, std::vector<std::string>> annotated_domains;
 };
-
-// Returns original_domains, but with the elements corresponding to
-// annotated_ds elements that start with prefix moved to the front.
-std::vector<std::string> reorder_domains(
-    const std::vector<std::string>& original_domains,
-    const std::vector<std::string>& annotated_ds,
-    const std::string& prefix);
diff --git a/cxx/pclean/schema_helper_test.cc b/cxx/pclean/schema_helper_test.cc
@@ -158,8 +158,7 @@ BOOST_AUTO_TEST_CASE(test_make_relations_for_queryfield) {
   BOOST_TEST(!std::get<T_noisy_relation>(tschema["Physician::School"]).is_observed);
 
   std::vector<std::string> expected_adfr = {
-    "physician:school:School", "location:city:City",
-    "location:Practice", "physician:Physician", "Record"};
+    "physician:school:School", "physician:Physician", "Record"};
   BOOST_TEST(annotated_domains_for_relation["School"] == expected_adfr,
              tt::per_element());
 }
@@ -336,31 +335,6 @@ BOOST_AUTO_TEST_CASE(test_make_hirm_schema_only_final_emissions) {
   BOOST_TEST(nr5.domains == expected_domains, tt::per_element());
 }
 
-BOOST_AUTO_TEST_CASE(test_reorder_domains) {
-  std::vector<std::string> origs = {"0", "1", "2", "3", "4", "5", "6", "7"};
-  std::vector<std::string> annotated = {
-    "000", "001", "010", "011", "100", "101", "110", "111"};
-
-  std::vector<std::string> expected = {
-    "6", "7", "0", "1", "2", "3", "4", "5"};
-  BOOST_TEST(reorder_domains(origs, annotated, "11") == expected);
-
-  expected = {"4", "5", "6", "7", "0", "1", "2", "3"};
-  BOOST_TEST(reorder_domains(origs, annotated, "1") == expected);
-
-  origs = {
-    "republic_of_ireland", "northern_ireland", "england", "scotland", "wales"};
-  annotated = {
-    "ireland:republic_of_ireland",
-    "ireland:uk:northern_ireland",
-    "great_britain:uk:england",
-    "great_britain:uk:scotland",
-    "great_britain:uk:wales"};
-  expected = {
-    "northern_ireland", "republic_of_ireland", "england", "scotland", "wales"};
-  BOOST_TEST(reorder_domains(origs, annotated, "ireland:uk:") == expected);
-}
-
 BOOST_AUTO_TEST_CASE(test_record_class_is_clean) {
   std::stringstream ss2(R"""(
 class Record