Fix importing full ThermoML archive (#402)

openforcefield · Nov 13, 2021 · 9f6e834 · 9f6e834
1 parent c045892
commit 9f6e834
Show file tree

Hide file tree

Showing 4 changed files with 41 additions and 38 deletions.
diff --git a/docs/datasets/curation.rst b/docs/datasets/curation.rst
@@ -123,9 +123,7 @@ Data Extraction
     )
 
     # Import all data from the full ThermoML archive.
-    data_frame = ImportThermoMLData.apply(
-        pandas.DataFrame(), ImportThermoMLDataSchema(journal_names=["IJT"])
-    )
+    data_frame = ImportThermoMLData.apply(pandas.DataFrame(), ImportThermoMLDataSchema())
 
 Filtration
 """"""""""

diff --git a/docs/releasehistory.rst b/docs/releasehistory.rst
@@ -8,6 +8,21 @@ Releases follow the ``major.minor.micro`` scheme recommended by
 * ``minor`` increments add features but do not break API compatibility
 * ``micro`` increments represent bugfix releases or improvements in documentation
 
+0.3.9
+-----
+
+Bugfixes
+""""""""
+
+* PR `#402 <https://github.com/openforcefield/openff-evaluator/pull/402>`_: Fix importing full ThermoML archive
+
+Behaviour Changes
+"""""""""""""""""
+
+The way that ThermoML archive files are served changed in 2021: individual journal archives are no longer made
+available, and only the full ThermoML archive can be downloaded. Because of this, ``ImportThermoMLDataSchema`` no
+longer has a ``journal_names`` field, so users can no longer select which journals to pull data from.
+
 0.3.8
 -----
 

diff --git a/openff/evaluator/datasets/curation/components/thermoml.py b/openff/evaluator/datasets/curation/components/thermoml.py
@@ -4,7 +4,7 @@
 import os
 import tarfile
 from multiprocessing import Pool
-from typing import List, Optional, Union
+from typing import Optional, Union
 
 import pandas
 import requests
@@ -21,10 +21,6 @@
 logger = logging.getLogger(__name__)
 
 
-def _default_journals():
-    return ["JCED", "JCT", "FPE", "TCA", "IJT"]
-
-
 class ImportThermoMLDataSchema(CurationComponentSchema):
 
     type: Literal["ImportThermoMLData"] = "ImportThermoMLData"
@@ -41,13 +37,10 @@ class ImportThermoMLDataSchema(CurationComponentSchema):
         "into, and to restore the output of this component from.",
     )
 
-    journal_names: List[Literal["JCED", "JCT", "FPE", "TCA", "IJT"]] = Field(
-        default_factory=_default_journals,
-        description="The abbreviated names of the journals to import data from.",
-    )
     root_archive_url: HttpUrl = Field(
-        default="https://trc.nist.gov/ThermoML",
-        description="The root url where the ThermoML archives can be downloaded from.",
+        default="https://data.nist.gov/od/ds/mds2-2422/ThermoML.v2020-09-30.tgz",
+        description="The root url where the main ThermoML archive can be downloaded "
+        "from.",
     )
 
 
@@ -59,23 +52,19 @@ class ImportThermoMLData(CurationComponent):
     @classmethod
     def _download_data(cls, schema: ImportThermoMLDataSchema):
 
-        for journal in schema.journal_names:
+        # Download the full ThermoML archive, which contains the properties from all journals.
+        request = requests.get(schema.root_archive_url, stream=True)
 
-            # Download the archive of all properties from the journal.
-            request = requests.get(
-                f"{schema.root_archive_url}/{journal}.tgz", stream=True
-            )
-
-            # Make sure the request went ok.
-            try:
-                request.raise_for_status()
-            except requests.exceptions.HTTPError as error:
-                print(error.response.text)
-                raise
+        # Make sure the request went ok.
+        try:
+            request.raise_for_status()
+        except requests.exceptions.HTTPError as error:
+            print(error.response.text)
+            raise
 
-                # Unzip the files into the temporary directory.
-            tar_file = tarfile.open(fileobj=io.BytesIO(request.content))
-            tar_file.extractall()
+        # Unzip the files into the temporary directory.
+        tar_file = tarfile.open(fileobj=io.BytesIO(request.content))
+        tar_file.extractall()
 
     @classmethod
     def _process_archive(cls, file_path: str) -> pandas.DataFrame:
@@ -124,7 +113,7 @@ def _apply(
             cls._download_data(schema)
 
             # Get the names of the extracted files
-            file_names = glob.glob("*.xml")
+            file_names = glob.glob(os.path.join("10.*", "*.xml"))
 
             logger.debug("Processing archives")
 

diff --git a/openff/evaluator/tests/test_datasets/test_curation/test_thermoml.py b/openff/evaluator/tests/test_datasets/test_curation/test_thermoml.py
@@ -18,19 +18,20 @@ def test_import_thermoml_data(requests_mock):
     # Create a tarball to be downloaded.
     source_path = get_data_filename(os.path.join("test", "properties", "mass.xml"))
 
-    with NamedTemporaryFile(suffix="tgz") as tar_file:
+    with NamedTemporaryFile(suffix=".tgz") as tar_file:
 
         with tarfile.open(tar_file.name, "w:gz") as tar:
-            tar.add(source_path, arcname=os.path.basename(source_path))
+            tar.add(
+                source_path,
+                arcname=os.path.join("10.1021", os.path.basename(source_path)),
+            )
+
+        schema = ImportThermoMLDataSchema()
 
         with open(tar_file.name, "rb") as file:
 
-            requests_mock.get(
-                "https://trc.nist.gov/ThermoML/IJT.tgz", content=file.read()
-            )
+            requests_mock.get(schema.root_archive_url, content=file.read())
 
-        data_frame = ImportThermoMLData.apply(
-            pandas.DataFrame(), ImportThermoMLDataSchema(journal_names=["IJT"])
-        )
+        data_frame = ImportThermoMLData.apply(pandas.DataFrame(), schema)
 
         assert data_frame is not None and len(data_frame) == 1