create test data for quickrun submitted in parallel (#1040)

OpenFreeEnergy · Dec 9, 2024 · 55c9924 · 55c9924
1 parent c2e4ee9
commit 55c9924
Show file tree

Hide file tree

Showing 3 changed files with 260 additions and 4 deletions.
diff --git a/openfecli/tests/commands/test_gather.py b/openfecli/tests/commands/test_gather.py
@@ -28,14 +28,25 @@ def test_get_column(val, col):
 
 
 @pytest.fixture
-def results_dir(tmpdir):
+def results_dir_serial(tmpdir):
+    """Example output data, with replicates run in serial (3 replicates per results JSON)."""
     with tmpdir.as_cwd():
         with resources.files('openfecli.tests.data') as d:
             t = tarfile.open(d / 'rbfe_results.tar.gz', mode='r')
             t.extractall('.')
 
         yield
 
+@pytest.fixture
+def results_dir_parallel(tmpdir):
+    """Identical data to results_dir_serial(), with replicates run in parallel (1 replicate per results JSON)."""
+    with tmpdir.as_cwd():
+        with resources.files('openfecli.tests.data') as d:
+            t = tarfile.open(d / 'results_parallel.tar.gz', mode='r')
+            t.extractall('.')
+
+        yield
+
 _EXPECTED_DG = b"""
 ligand	DG(MLE) (kcal/mol)	uncertainty (kcal/mol)
 lig_ejm_31	-0.09	0.05
@@ -146,7 +157,7 @@ def results_dir(tmpdir):
 
 
 @pytest.mark.parametrize('report', ["", "dg", "ddg", "raw"])
-def test_gather(results_dir, report):
+def test_gather(results_dir_serial, report):
     expected = {
         "": _EXPECTED_DG,
         "dg": _EXPECTED_DG,
@@ -185,7 +196,7 @@ def test_generate_bad_legs_error_message(include):
 
 
 @pytest.mark.xfail
-def test_missing_leg_error(results_dir):
+def test_missing_leg_error(results_dir_serial):
     file_to_remove = "easy_rbfe_lig_ejm_31_complex_lig_ejm_42_complex.json"
     (pathlib.Path("results") / file_to_remove).unlink()
 
@@ -199,7 +210,7 @@ def test_missing_leg_error(results_dir):
 
 
 @pytest.mark.xfail
-def test_missing_leg_allow_partial(results_dir):
+def test_missing_leg_allow_partial(results_dir_serial):
     file_to_remove = "easy_rbfe_lig_ejm_31_complex_lig_ejm_42_complex.json"
     (pathlib.Path("results") / file_to_remove).unlink()
 

diff --git a/openfecli/tests/data/restructure_results_data.ipynb b/openfecli/tests/data/restructure_results_data.ipynb
@@ -0,0 +1,245 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8d1899bc-337a-4024-9fa3-9cfbc452e091",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json \n",
+    "from gufe.tokenization import JSON_HANDLER\n",
+    "import numpy as np\n",
+    "import os \n",
+    "import shutil\n",
+    "from pathlib import Path"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a82b8123-521a-4ca3-a2cf-f73b6504fa14",
+   "metadata": {},
+   "source": [
+    "For this dataset, we know we have 3 replicates run in serial for each leg. We want to restructure the data so that it is equivalent to the output we would get if we re-ran this dataset with the replicates of each leg run in parallel (one replicate per results JSON), with the following directory structure:\n",
+    "\n",
+    "```\n",
+    "results_parallel/\n",
+    "  replicate_0/\n",
+    "      easy_rbfe_lig_ejm_31_complex_lig_ejm_42_complex/\n",
+    "          shared_[hashA]_attempt_0/\n",
+    "      easy_rbfe_lig_ejm_31_complex_lig_ejm_42_complex.json\n",
+    "  replicate_1/\n",
+    "      easy_rbfe_lig_ejm_31_complex_lig_ejm_42_complex/\n",
+    "          shared_[hashB]_attempt_0/\n",
+    "      easy_rbfe_lig_ejm_31_complex_lig_ejm_42_complex.json\n",
+    "  replicate_2/\n",
+    "      easy_rbfe_lig_ejm_31_complex_lig_ejm_42_complex/\n",
+    "          shared_[hashC]_attempt_0/\n",
+    "      easy_rbfe_lig_ejm_31_complex_lig_ejm_42_complex.json\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1c6ed7fe-b42c-4781-b356-85799e25356f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def load_json(fpath):\n",
+    "    return json.load(open(fpath, 'r'), cls=JSON_HANDLER.decoder)\n",
+    "\n",
+    "def dump_json(data, fpath):\n",
+    "    with open(fpath, \"w\") as f:\n",
+    "        json.dump(data, f, cls=JSON_HANDLER.encoder)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8eba246a-6123-4d8e-8fd8-2de516fbf881",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "orig_dir = Path(\"results/\")\n",
+    "new_dir = Path(\"results_parallel/\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ab4f2587-9b15-422d-9faa-e11ff98fd491",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "leg_names = []\n",
+    "for name in os.listdir(orig_dir):\n",
+    "    if name.endswith(\".json\"):\n",
+    "        continue\n",
+    "    leg_names.append(name)\n",
+    "leg_names"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "311a7f0e-9c91-47ae-9e09-0e1bef03aca8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "! rm -rf $new_dir\n",
+    "for leg in leg_names:\n",
+    "    json_data = load_json(orig_dir/f\"{leg}.json\")\n",
+    "    srckey_to_protocol = {}\n",
+    "    srckey_to_unit_results = {}\n",
+    "    srckey_to_estimate = {}\n",
+    "    ## collect results on a per-replicate basis\n",
+    "    for k in json_data['protocol_result']['data']:    \n",
+    "        rep_source_key = json_data['protocol_result']['data'][k][0]['source_key']\n",
+    "        \n",
+    "        # keep only the data for this replicate\n",
+    "        rep_result = json_data['protocol_result'].copy()\n",
+    "        rep_result['data']={k:json_data['protocol_result']['data'][k]}\n",
+    "        srckey_to_protocol[rep_source_key] = rep_result\n",
+    "\n",
+    "        # pull just the estimate value so we can put it at the top of the output\n",
+    "        srckey_to_estimate[rep_source_key] = rep_result['data'][k][0]['outputs']['unit_estimate']\n",
+    "        \n",
+    "    for k in json_data['unit_results']:\n",
+    "        rep_source_key = json_data['unit_results'][k]['source_key']\n",
+    "\n",
+    "        rep_unit_result = json_data['unit_results'].copy()\n",
+    "        rep_unit_result = {k: json_data['unit_results'][k]}\n",
+    "        srckey_to_unit_results[rep_source_key] = rep_unit_result\n",
+    "    \n",
+    "    assert srckey_to_protocol.keys() == srckey_to_unit_results.keys()\n",
+    "    \n",
+    "    ## write to the new directory\n",
+    "    for n, sk in enumerate(sorted(srckey_to_protocol.keys())):\n",
+    "        rep_dir = new_dir/f\"replicate_{n}\"\n",
+    "        os.makedirs(rep_dir/leg)\n",
+    "    \n",
+    "        # build up the data for this replicate\n",
+    "        replicate_data = {'estimate': srckey_to_estimate[sk],\n",
+    "                          'uncertainty': np.std(srckey_to_estimate[sk]),\n",
+    "                          'protocol_result': srckey_to_protocol[sk],\n",
+    "                          'unit_results': srckey_to_unit_results[sk]}\n",
+    "    \n",
+    "        # write!\n",
+    "        dump_json(replicate_data, rep_dir/f\"{leg}.json\")\n",
+    "        working_dir_name = f\"shared_{sk}_attempt_0\"\n",
+    "        ## TODO: make this work for arbitrary number of attempts \n",
+    "        # os.symlink(orig_dir/leg/working_dir_name, rep_dir/leg/working_dir_name)\n",
+    "        shutil.copytree(orig_dir/leg/working_dir_name, rep_dir/leg/working_dir_name)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f864dcb3-bebf-425b-9154-bffc2b0e3f07",
+   "metadata": {},
+   "source": [
+    "## check that objects reload correctly"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6c20639c-8ba7-457a-bf8a-76c64aef4a38",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import openfe"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9cba8316-5500-4d5e-a84d-d72d09ba2a42",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "json_reloaded = load_json(\"results_parallel/replicate_0/easy_rbfe_lig_ejm_31_solvent_lig_ejm_47_solvent.json\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c0e90b45-ae83-41c1-8748-0a8c1466b378",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "json_reloaded['estimate'], json_reloaded['uncertainty']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c0ce2bc6-d960-4521-b71c-316be0557e9d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pr_reloaded = openfe.ProtocolResult.from_dict(json_reloaded['protocol_result'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a2fbb695-d4ef-45bd-af53-2ef9d0bc8e0a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pr_reloaded.data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "19662eaa-46de-4eb0-8c78-ddd6c68b12db",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "first_pur_key = list(json_reloaded['unit_results'].keys())[0]\n",
+    "pur_reloaded = openfe.ProtocolUnit.from_dict(json_reloaded['unit_results'][first_pur_key])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0154fda2-4c1a-4064-8bcc-03aeecf13365",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pur_reloaded"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3f2bbc84-f59c-40b9-a176-9a733ff275c1",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/openfecli/tests/data/results_parallel.tar.gz b/openfecli/tests/data/results_parallel.tar.gz