Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use get to avoid KeyError on missing cmiles attribute #300

Merged
merged 14 commits into from
Oct 21, 2024
31 changes: 31 additions & 0 deletions openff/qcsubmit/_tests/results/test_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
"""

import datetime
import logging
from collections import defaultdict
from tempfile import TemporaryDirectory

import pytest
Expand Down Expand Up @@ -464,3 +466,32 @@ def test_torsion_smirnoff_coverage(public_client, monkeypatch):
assert {*coverage["Bonds"].values()} == {3}
assert {*coverage["Angles"].values()} == {3}
assert {*coverage["ProperTorsions"].values()} == {1, 3}


def test_missing_cmiles_basic_result_collection(public_client, caplog):
"""Some older datasets don't have CMILES in the single-point records. As
reported in #299, this would cause a KeyError when retrieving these
datasets. Such entries should now be skipped, but this can lead to empty
datasets, so we also print a warning for each missing CMILES.
"""
with caplog.at_level(logging.INFO):
basic_collection = BasicResultCollection.from_server(
public_client,
["OpenFF Gen 2 Opt Set 1 Roche"],
spec_name="spec_1",
)

logs = defaultdict(int)
for record in caplog.records:
logs[record.levelname] += 1

# should be 298 of these at the INFO level
assert logs["INFO"] == 298
assert "MISSING CMILES" in caplog.text

# should be 1 of these at the end at the default WARNING level
assert logs["WARNING"] == 1
assert "Missing 298/298" in caplog.text

# no results because they are all missing CMILES
assert basic_collection.n_results == 0
30 changes: 24 additions & 6 deletions openff/qcsubmit/results/results.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from __future__ import annotations

import abc
import logging
import warnings
from collections import defaultdict
from typing import (
Expand Down Expand Up @@ -49,6 +50,9 @@
T = TypeVar("T")
S = TypeVar("S")

logging.basicConfig()
logger = logging.getLogger(__name__)


class _BaseResult(BaseModel, abc.ABC):
"""The base model for storing information about a QC record generated by
Expand Down Expand Up @@ -295,6 +299,8 @@ def from_datasets(

result_records = defaultdict(dict)

missing_cmiles = 0
total_entries = 0
for dataset in datasets:
client = dataset._client

Expand All @@ -310,6 +316,7 @@ def from_datasets(
for entry_name, spec_name, record in dataset.iterate_records(
specification_names=spec_name, status=RecordStatusEnum.complete
):
total_entries += 1
entry = dataset.get_entry(entry_name)
molecule = entry.molecule

Expand All @@ -321,11 +328,12 @@ def from_datasets(
"canonical_isomeric_explicit_hydrogen_mapped_smiles"
)
if not cmiles:
cmiles = entry.attributes[
cmiles = entry.attributes.get(
"canonical_isomeric_explicit_hydrogen_mapped_smiles"
]
)
if not cmiles:
print(f"MISSING CMILES! entry = {entry_name}")
logger.info(f"MISSING CMILES! entry = {entry_name}")
missing_cmiles += 1
continue

inchi_key = entry.attributes.get("fixed_hydrogen_inchi_key")
Expand All @@ -344,6 +352,16 @@ def from_datasets(
)
result_records[client.address][record.id] = br

if missing_cmiles > 0:
logger.warning(
f"Missing {missing_cmiles}/{total_entries} CMILES. "
"Some legacy datasets may only have CMILES in their "
"optimization datasets (often with the same name as the "
"singlepoint dataset). "
"See OptimizationResultCollection.to_basic_result_collection "
"for a way to convert to a BasicResultCollection."
)

return cls(
entries={
address: [*records.values()]
Expand Down Expand Up @@ -470,11 +488,11 @@ def from_datasets(
"canonical_isomeric_explicit_hydrogen_mapped_smiles"
)
if not cmiles:
cmiles = entry.attributes[
cmiles = entry.attributes.get(
"canonical_isomeric_explicit_hydrogen_mapped_smiles"
]
)
if not cmiles:
print(f"MISSING CMILES! entry = {entry_name}")
logger.info(f"MISSING CMILES! entry = {entry_name}")
continue

inchi_key = entry.attributes.get("fixed_hydrogen_inchi_key")
Expand Down
Loading