Skip to content

Commit

Permalink
Fix importing full ThermoML archive (#402)
Browse files Browse the repository at this point in the history
  • Loading branch information
SimonBoothroyd authored Nov 13, 2021
1 parent c045892 commit 9f6e834
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 38 deletions.
4 changes: 1 addition & 3 deletions docs/datasets/curation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -123,9 +123,7 @@ Data Extraction
)

# Import all data from the full ThermoML archive.
data_frame = ImportThermoMLData.apply(
pandas.DataFrame(), ImportThermoMLDataSchema(journal_names=["IJT"])
)
data_frame = ImportThermoMLData.apply(pandas.DataFrame(), ImportThermoMLDataSchema())

Filtration
""""""""""
Expand Down
15 changes: 15 additions & 0 deletions docs/releasehistory.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,21 @@ Releases follow the ``major.minor.micro`` scheme recommended by
* ``minor`` increments add features but do not break API compatibility
* ``micro`` increments represent bugfix releases or improvements in documentation

0.3.9
-----

Bugfixes
""""""""

* PR `#402 <https://github.com/openforcefield/openff-evaluator/pull/402>`_: Fix importing full ThermoML archive

Behaviour Changes
"""""""""""""""""

The way that ThermoML archive files are served changed in 2021: individual journal archives are no longer
made available, and only the full ThermoML archive can be downloaded. Because of this, the ``ImportThermoMLDataSchema``
schema no longer allows users to select which journal to pull data from.

0.3.8
-----

Expand Down
43 changes: 16 additions & 27 deletions openff/evaluator/datasets/curation/components/thermoml.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import os
import tarfile
from multiprocessing import Pool
from typing import List, Optional, Union
from typing import Optional, Union

import pandas
import requests
Expand All @@ -21,10 +21,6 @@
logger = logging.getLogger(__name__)


def _default_journals():
return ["JCED", "JCT", "FPE", "TCA", "IJT"]


class ImportThermoMLDataSchema(CurationComponentSchema):

type: Literal["ImportThermoMLData"] = "ImportThermoMLData"
Expand All @@ -41,13 +37,10 @@ class ImportThermoMLDataSchema(CurationComponentSchema):
"into, and to restore the output of this component from.",
)

journal_names: List[Literal["JCED", "JCT", "FPE", "TCA", "IJT"]] = Field(
default_factory=_default_journals,
description="The abbreviated names of the journals to import data from.",
)
root_archive_url: HttpUrl = Field(
default="https://trc.nist.gov/ThermoML",
description="The root url where the ThermoML archives can be downloaded from.",
default="https://data.nist.gov/od/ds/mds2-2422/ThermoML.v2020-09-30.tgz",
description="The root url where the main ThermoML archive can be downloaded "
"from.",
)


Expand All @@ -59,23 +52,19 @@ class ImportThermoMLData(CurationComponent):
@classmethod
def _download_data(cls, schema: ImportThermoMLDataSchema):

for journal in schema.journal_names:
# Download the full ThermoML archive (all journals in a single tarball).
request = requests.get(schema.root_archive_url, stream=True)

# Download the archive of all properties from the journal.
request = requests.get(
f"{schema.root_archive_url}/{journal}.tgz", stream=True
)

# Make sure the request went ok.
try:
request.raise_for_status()
except requests.exceptions.HTTPError as error:
print(error.response.text)
raise
# Make sure the request went ok.
try:
request.raise_for_status()
except requests.exceptions.HTTPError as error:
print(error.response.text)
raise

# Unzip the files into the temporary directory.
tar_file = tarfile.open(fileobj=io.BytesIO(request.content))
tar_file.extractall()
# Unzip the files into the temporary directory.
tar_file = tarfile.open(fileobj=io.BytesIO(request.content))
tar_file.extractall()

@classmethod
def _process_archive(cls, file_path: str) -> pandas.DataFrame:
Expand Down Expand Up @@ -124,7 +113,7 @@ def _apply(
cls._download_data(schema)

# Get the names of the extracted files
file_names = glob.glob("*.xml")
file_names = glob.glob(os.path.join("10.*", "*.xml"))

logger.debug("Processing archives")

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,19 +18,20 @@ def test_import_thermoml_data(requests_mock):
# Create a tarball to be downloaded.
source_path = get_data_filename(os.path.join("test", "properties", "mass.xml"))

with NamedTemporaryFile(suffix="tgz") as tar_file:
with NamedTemporaryFile(suffix=".tgz") as tar_file:

with tarfile.open(tar_file.name, "w:gz") as tar:
tar.add(source_path, arcname=os.path.basename(source_path))
tar.add(
source_path,
arcname=os.path.join("10.1021", os.path.basename(source_path)),
)

schema = ImportThermoMLDataSchema()

with open(tar_file.name, "rb") as file:

requests_mock.get(
"https://trc.nist.gov/ThermoML/IJT.tgz", content=file.read()
)
requests_mock.get(schema.root_archive_url, content=file.read())

data_frame = ImportThermoMLData.apply(
pandas.DataFrame(), ImportThermoMLDataSchema(journal_names=["IJT"])
)
data_frame = ImportThermoMLData.apply(pandas.DataFrame(), schema)

assert data_frame is not None and len(data_frame) == 1

0 comments on commit 9f6e834

Please sign in to comment.