Skip to content

Commit

Permalink
cif::Ddl: more checks, small modifications to messages
Browse files Browse the repository at this point in the history
previously, the content of save-frames was skipped; now it's validated
in the same way as blocks
  • Loading branch information
wojdyr committed Jan 4, 2025
1 parent 24ca410 commit 15ab68d
Show file tree
Hide file tree
Showing 2 changed files with 109 additions and 99 deletions.
6 changes: 5 additions & 1 deletion include/gemmi/ddl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@

#include <map>
#include <memory> // for unique_ptr
#include <ostream>
#include <regex>
#include "cifdoc.hpp" // for cif::Document
#include "logger.hpp" // for Logger
Expand Down Expand Up @@ -41,9 +40,12 @@ struct GEMMI_DLL Ddl {
void read_ddl(cif::Document&& doc);

bool validate_cif(const cif::Document& doc) const;
bool validate_block(const cif::Block& b, const std::string& source) const;

void check_audit_conform(const cif::Document& doc) const;

// Read-only access to the compiled regexes; in read_ddl2_block() they are
// stored under the _item_type_list.code value of each type.
const std::map<std::string, std::regex>& regexes() const { return regexes_; }

private:
// items from DDL2 _pdbx_item_linked_group[_list]
struct ParentLink {
Expand All @@ -58,6 +60,8 @@ struct GEMMI_DLL Ddl {
std::vector<ParentLink> parents_;
// storage for DDL2 _item_linked.child_name -> _item_linked.parent_name
std::map<std::string, std::string> item_parents_;
// counter used to limit the number of reported missing-category-key errors
mutable int missing_category_key_errors = 0;

cif::Block* find_rules(const std::string& name) const {
auto iter = name_index_.find(to_lower(name));
Expand Down
202 changes: 104 additions & 98 deletions src/ddl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,9 @@ std::string row_as_string(cif::Table::Row row) {
});
}

class Validator1 {
class Ddl1Rules {
public:
Validator1(cif::Block& b) {
Ddl1Rules(cif::Block& b) {
if (const std::string* list = b.find_value("_list")) {
if (*list == "yes")
is_list_ = Trinary::Yes;
Expand Down Expand Up @@ -136,22 +136,23 @@ class Validator1 {
};


class Validator2 {
class Ddl2Rules {
public:
enum class Type : char { Unset, Int, Float };
enum class ItemContext { Default, Local, Deprecated };

Validator2(cif::Block& b, const std::map<std::string, std::regex>& regexes) {
Ddl2Rules(cif::Block& b, const Ddl* ddl, const std::string& tag) {
if (const std::string* code = b.find_value("_item_type.code")) {
type_code_ = cif::as_string(*code);
if (type_code_ == "float") {
type_ = Type::Float;
} else if (type_code_ == "int") {
type_ = Type::Int;
} else { // to make it faster, we don't use regex for int and float
auto it = regexes.find(*code);
if (it != regexes.end())
auto it = ddl->regexes().find(type_code_);
if (it != ddl->regexes().end())
re_ = &it->second;
else
ddl->logger.mesg("Bad DDL2: ", tag, " has undefined type: ", type_code_);
}
}
for (auto row : b.find("_item_range.", {"minimum", "maximum"}))
Expand All @@ -166,12 +167,6 @@ class Validator2 {
associated_value_ = row.str(0);
}
*/
if (const std::string* context = b.find_value("_pdbx_item_context.type")) {
if (*context == "WWPDB_LOCAL")
context_ = ItemContext::Local;
else if (*context == "WWPDB_DEPRECATED")
context_ = ItemContext::Deprecated;
}
}

// takes raw value
Expand Down Expand Up @@ -229,22 +224,9 @@ class Validator2 {
return false;
}

bool check_context_type(std::string* msg) const {
if (context_ == ItemContext::Deprecated) {
*msg = " is deprecated";
return false;
}
if (context_ == ItemContext::Local) {
*msg = " is for pdb internal use";
return false;
}
return true;
}

private:
Type type_ = Type::Unset;
bool icase_ = false;
ItemContext context_ = ItemContext::Default;
std::vector<std::string> enumeration_;
std::string type_code_;
std::vector<std::pair<double, double>> range_;
Expand All @@ -269,7 +251,7 @@ void Ddl::check_audit_conform(const cif::Document& doc) const {
std::string version = cif::as_string(*dict_ver);
if (version != dict_version) {
auto major = [](const std::string& s) { return s.substr(0, s.find('.')); };
if (logger.threshold >= 7 || major(version) == major(dict_version))
if (logger.threshold >= 7 || major(version) != major(dict_version))
logger.note(br(b), "conforms to ", name, " ver. ", version,
" while DDL has ver. ", dict_version);
}
Expand Down Expand Up @@ -310,8 +292,15 @@ void Ddl::check_mandatory_items(const cif::Block& b) const {
std::string key = cif::as_string(v);
if (!gemmi::istarts_with(key, cat.first))
logger.level<3>("inconsistent dictionary: wrong _category_key for ", cat_name);
if (!gemmi::in_vector(to_lower(key.substr(n)), cat.second))
warn(b, "missing category key: ", key);
if (!gemmi::in_vector(to_lower(key.substr(n)), cat.second)) {
// In mmcif_pdbx_v50.dic a category key is missing so often
// that this generates about 20,000 errors, resulting in too much text.
if (missing_category_key_errors < 20 || logger.threshold >= 7)
warn(b, "missing category key: ", key);
else if (missing_category_key_errors == 20)
logger.level<3>("(Increase verbosity to show all missing-category-key errors.)");
missing_category_key_errors++;
}
}
// check mandatory items
for (auto i = name_index_.lower_bound(cat.first);
Expand Down Expand Up @@ -354,8 +343,9 @@ void Ddl::check_unique_keys_in_loop(const cif::Loop& loop, const Block& block) c
}
}
if (dup_counter != 0) {
warn(block, "category ", cat_name, " has ", dup_counter, "duplicated key(s):\n 1st dup:",
gemmi::join_str(key_positions, " and ", [&](int k) {
warn(block, "category ", cat_name, " has ", dup_counter, " duplicated key",
dup_counter == 1 ? ":\n " : "s, first one:\n ",
gemmi::join_str(key_positions, " + ", [&](int k) {
return gemmi::cat(loop.tags[k].substr(dot_pos+1), '=', loop.values[dup_row + k]);
}));
}
Expand Down Expand Up @@ -476,10 +466,6 @@ void Ddl::read_ddl2_block(cif::Block& block) {

if (use_regex)
for (auto row : block.find("_item_type_list.", {"code", "construct"})) {
if (cif::is_text_field(row[1]))
// text field is problematic, but it's used only for "binary"
// which in turn is never used
continue;
try {
std::string re_str = row.str(1);
// mmcif_pdbx_v50.dic uses custom flavour of regex:
Expand All @@ -491,8 +477,9 @@ void Ddl::read_ddl2_block(cif::Block& block) {
auto flag = std::regex::awk | std::regex::optimize;
regexes_.emplace(row.str(0), std::regex(re_str, flag));
} catch (const std::regex_error& e) {
logger.note("Ddl has invalid regex for ", row[0], ":\n ",
row.str(1), "\n ", e.what(), '\n');
logger.mesg("Bad DDL2: can't parse regex for '", row[0], "': ", e.what());
// add an always-matching placeholder to avoid errors later
regexes_.emplace(row.str(0), std::regex(".*"));
}
}

Expand Down Expand Up @@ -554,82 +541,101 @@ void Ddl::read_ddl2_block(cif::Block& block) {
}
}

// Looks up _pdbx_item_context.type in the dictionary block of an item.
// Returns a message suffix (meant to be appended to the tag name) when the
// value is WWPDB_LOCAL or WWPDB_DEPRECATED; returns nullptr when the value
// is absent or has any other content.
static const char* wrong_ddl2_context(const cif::Block& dict_block) {
  const std::string* ctx_type = dict_block.find_value("_pdbx_item_context.type");
  if (ctx_type == nullptr)
    return nullptr;
  if (*ctx_type == "WWPDB_LOCAL")
    return " is for pdb internal use";
  if (*ctx_type == "WWPDB_DEPRECATED")
    return " is deprecated";
  return nullptr;
}

bool Ddl::validate_cif(const cif::Document& doc) const {
std::string msg;
bool ok = true;
auto err = [&](const cif::Block& b, const cif::Item& item, const std::string& s) {
for (const cif::Block& b : doc.blocks)
if (!validate_block(b, doc.source))
ok = false;
return ok;
}

bool Ddl::validate_block(const cif::Block& b, const std::string& source) const {
bool ok = true;
std::string msg;
auto err = [&](const cif::Item& item, const std::string& s) {
ok = false;
logger.level<3>(doc.source, ':', item.line_number, " [", b.name, "] ", s);
logger.level<3>(source, ':', item.line_number, " [", b.name, "] ", s);
};

for (const cif::Block& b : doc.blocks) {
for (const cif::Item& item : b.items) {
if (item.type == cif::ItemType::Pair) {
cif::Block* dict_block = find_rules(item.pair[0]);
for (const cif::Item& item : b.items) {
if (item.type == cif::ItemType::Pair) {
const std::string& tag = item.pair[0];
cif::Block* dict_block = find_rules(tag);
if (!dict_block) {
if (print_unknown_tags)
warn(b, "unknown tag ", tag);
continue;
}
// validate pair
if (major_version == 1) {
Ddl1Rules rules(*dict_block);
if (rules.is_list() == Trinary::Yes)
err(item, tag + " must be a list");
if (!rules.validate_value(item.pair[1], &msg))
err(item, msg);
} else {
if (use_context)
if (const char* bad_ctx = wrong_ddl2_context(*dict_block))
err(item, tag + bad_ctx);
Ddl2Rules rules(*dict_block, this, tag);
if (!rules.validate_value(item.pair[1], &msg))
err(item, msg);
}
} else if (item.type == cif::ItemType::Loop) {
const size_t ncol = item.loop.tags.size();
for (size_t i = 0; i != ncol; i++) {
const std::string& tag = item.loop.tags[i];
cif::Block* dict_block = find_rules(tag);
if (!dict_block) {
if (print_unknown_tags)
warn(b, "unknown tag ", item.pair[0]);
warn(b, "unknown tag ", tag);
continue;
}
// validate pair
// validate column in loop
if (major_version == 1) {
Validator1 tc(*dict_block);
if (tc.is_list() == Trinary::Yes)
err(b, item, item.pair[0] + " must be a list");
if (!tc.validate_value(item.pair[1], &msg))
err(b, item, msg);
Ddl1Rules rules(*dict_block);
if (rules.is_list() == Trinary::No)
err(item, tag + " in list");
for (size_t j = i; j < item.loop.values.size(); j += ncol)
if (!rules.validate_value(item.loop.values[j], &msg)) {
err(item, cat(tag, ": ", msg));
break; // stop after first error to avoid clutter
}
} else {
Validator2 tc(*dict_block, regexes_);
if (use_context && !tc.check_context_type(&msg))
err(b, item, item.pair[0] + msg);
if (!tc.validate_value(item.pair[1], &msg))
err(b, item, msg);
}
} else if (item.type == cif::ItemType::Loop) {
const size_t ncol = item.loop.tags.size();
for (size_t i = 0; i != ncol; i++) {
const std::string& tag = item.loop.tags[i];
cif::Block* dict_block = find_rules(tag);
if (!dict_block) {
if (print_unknown_tags)
warn(b, "unknown tag ", tag);
continue;
}
// validate column in loop
if (major_version == 1) {
Validator1 tc(*dict_block);
if (tc.is_list() == Trinary::No)
err(b, item, tag + " in list");
for (size_t j = i; j < item.loop.values.size(); j += ncol)
if (!tc.validate_value(item.loop.values[j], &msg)) {
err(b, item, cat(tag, ": ", msg));
break; // stop after first error to avoid clutter
}
} else {
Validator2 tc(*dict_block, regexes_);
if (use_context && !tc.check_context_type(&msg))
err(b, item, tag + msg);
for (size_t j = i; j < item.loop.values.size(); j += ncol)
if (!tc.validate_value(item.loop.values[j], &msg)) {
err(b, item, cat(tag, ": ", msg));
break; // stop after first error to avoid clutter
}
}
if (use_context)
if (const char* bad_ctx = wrong_ddl2_context(*dict_block))
err(item, tag + bad_ctx);
Ddl2Rules rules(*dict_block, this, tag);
for (size_t j = i; j < item.loop.values.size(); j += ncol)
if (!rules.validate_value(item.loop.values[j], &msg)) {
err(item, cat(tag, ": ", msg));
break; // stop after first error to avoid clutter
}
}
}
} else if (item.type == cif::ItemType::Frame) {
validate_block(item.frame, source);
}
}

if (major_version == 2) {
if (use_mandatory)
check_mandatory_items(b);
if (use_unique_keys) {
for (const cif::Item& item : b.items)
if (item.type == cif::ItemType::Loop)
check_unique_keys_in_loop(item.loop, b);
}
if (use_parents)
check_parents(b);
if (major_version == 2) {
if (use_mandatory)
check_mandatory_items(b);
if (use_unique_keys) {
for (const cif::Item& item : b.items)
if (item.type == cif::ItemType::Loop)
check_unique_keys_in_loop(item.loop, b);
}
if (use_parents)
check_parents(b);
}

return ok;
Expand Down

0 comments on commit 15ab68d

Please sign in to comment.