Skip to content

Commit

Permalink
create test data for quickrun submitted in parallel (#1040)
Browse files Browse the repository at this point in the history
  • Loading branch information
atravitz authored Dec 9, 2024
1 parent c2e4ee9 commit 55c9924
Show file tree
Hide file tree
Showing 3 changed files with 260 additions and 4 deletions.
19 changes: 15 additions & 4 deletions openfecli/tests/commands/test_gather.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,25 @@ def test_get_column(val, col):


@pytest.fixture
def results_dir(tmpdir):
def results_dir_serial(tmpdir):
"""Example output data, with replicates run in serial (3 replicates per results JSON)."""
with tmpdir.as_cwd():
with resources.files('openfecli.tests.data') as d:
t = tarfile.open(d / 'rbfe_results.tar.gz', mode='r')
t.extractall('.')

yield

@pytest.fixture
def results_dir_parallel(tmpdir):
"""Identical data to results_dir_serial(), with replicates run in parallel (1 replicate per results JSON)."""
with tmpdir.as_cwd():
with resources.files('openfecli.tests.data') as d:
t = tarfile.open(d / 'results_parallel.tar.gz', mode='r')
t.extractall('.')

yield

_EXPECTED_DG = b"""
ligand DG(MLE) (kcal/mol) uncertainty (kcal/mol)
lig_ejm_31 -0.09 0.05
Expand Down Expand Up @@ -146,7 +157,7 @@ def results_dir(tmpdir):


@pytest.mark.parametrize('report', ["", "dg", "ddg", "raw"])
def test_gather(results_dir, report):
def test_gather(results_dir_serial, report):
expected = {
"": _EXPECTED_DG,
"dg": _EXPECTED_DG,
Expand Down Expand Up @@ -185,7 +196,7 @@ def test_generate_bad_legs_error_message(include):


@pytest.mark.xfail
def test_missing_leg_error(results_dir):
def test_missing_leg_error(results_dir_serial):
file_to_remove = "easy_rbfe_lig_ejm_31_complex_lig_ejm_42_complex.json"
(pathlib.Path("results") / file_to_remove).unlink()

Expand All @@ -199,7 +210,7 @@ def test_missing_leg_error(results_dir):


@pytest.mark.xfail
def test_missing_leg_allow_partial(results_dir):
def test_missing_leg_allow_partial(results_dir_serial):
file_to_remove = "easy_rbfe_lig_ejm_31_complex_lig_ejm_42_complex.json"
(pathlib.Path("results") / file_to_remove).unlink()

Expand Down
245 changes: 245 additions & 0 deletions openfecli/tests/data/restructure_results_data.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,245 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "8d1899bc-337a-4024-9fa3-9cfbc452e091",
"metadata": {},
"outputs": [],
"source": [
"import json \n",
"from gufe.tokenization import JSON_HANDLER\n",
"import numpy as np\n",
"import os \n",
"import shutil\n",
"from pathlib import Path"
]
},
{
"cell_type": "markdown",
"id": "a82b8123-521a-4ca3-a2cf-f73b6504fa14",
"metadata": {},
"source": [
"For this dataset, we know we have 3 replicates run in serial for each leg. We want to restructure the data so that it is equivalent to the output we would get if we re-ran this dataset with the replicates of each leg run in parallel (one replicate per results JSON), with the following directory structure:\n",
"\n",
"```\n",
"results_parallel/\n",
"    replicate_0/\n",
"        easy_rbfe_lig_ejm_31_complex_lig_ejm_42_complex.json\n",
"        easy_rbfe_lig_ejm_31_complex_lig_ejm_42_complex/\n",
"            shared_[hashA]_attempt_0/\n",
"    replicate_1/\n",
"        easy_rbfe_lig_ejm_31_complex_lig_ejm_42_complex.json\n",
"        easy_rbfe_lig_ejm_31_complex_lig_ejm_42_complex/\n",
"            shared_[hashB]_attempt_0/\n",
"    replicate_2/\n",
"        easy_rbfe_lig_ejm_31_complex_lig_ejm_42_complex.json\n",
"        easy_rbfe_lig_ejm_31_complex_lig_ejm_42_complex/\n",
"            shared_[hashC]_attempt_0/\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1c6ed7fe-b42c-4781-b356-85799e25356f",
"metadata": {},
"outputs": [],
"source": [
"def load_json(fpath):\n",
" return json.load(open(fpath, 'r'), cls=JSON_HANDLER.decoder)\n",
"\n",
"def dump_json(data, fpath):\n",
" with open(fpath, \"w\") as f:\n",
" json.dump(data, f, cls=JSON_HANDLER.encoder)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8eba246a-6123-4d8e-8fd8-2de516fbf881",
"metadata": {},
"outputs": [],
"source": [
"orig_dir = Path(\"results/\")\n",
"new_dir = Path(\"results_parallel/\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ab4f2587-9b15-422d-9faa-e11ff98fd491",
"metadata": {},
"outputs": [],
"source": [
"leg_names = []\n",
"for name in os.listdir(orig_dir):\n",
" if name.endswith(\".json\"):\n",
" continue\n",
" leg_names.append(name)\n",
"leg_names"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "311a7f0e-9c91-47ae-9e09-0e1bef03aca8",
"metadata": {},
"outputs": [],
"source": [
"! rm -rf $new_dir\n",
"for leg in leg_names:\n",
" json_data = load_json(orig_dir/f\"{leg}.json\")\n",
" srckey_to_protocol = {}\n",
" srckey_to_unit_results = {}\n",
" srckey_to_estimate = {}\n",
" ## collect results on a per-replicate basis\n",
" for k in json_data['protocol_result']['data']: \n",
" rep_source_key = json_data['protocol_result']['data'][k][0]['source_key']\n",
" \n",
" # keep only the data for this replicate\n",
" rep_result = json_data['protocol_result'].copy()\n",
" rep_result['data']={k:json_data['protocol_result']['data'][k]}\n",
" srckey_to_protocol[rep_source_key] = rep_result\n",
"\n",
" # pull just the estimate value so we can put it at the top of the output\n",
" srckey_to_estimate[rep_source_key] = rep_result['data'][k][0]['outputs']['unit_estimate']\n",
" \n",
" for k in json_data['unit_results']:\n",
" rep_source_key = json_data['unit_results'][k]['source_key']\n",
"\n",
" rep_unit_result = json_data['unit_results'].copy()\n",
" rep_unit_result = {k: json_data['unit_results'][k]}\n",
" srckey_to_unit_results[rep_source_key] = rep_unit_result\n",
" \n",
" assert srckey_to_protocol.keys() == srckey_to_unit_results.keys()\n",
" \n",
" ## write to the new directory\n",
" for n, sk in enumerate(sorted(srckey_to_protocol.keys())):\n",
" rep_dir = new_dir/f\"replicate_{n}\"\n",
" os.makedirs(rep_dir/leg)\n",
" \n",
" # build up the data for this replicate\n",
" replicate_data = {'estimate': srckey_to_estimate[sk],\n",
" 'uncertainty': np.std(srckey_to_estimate[sk]),\n",
" 'protocol_result': srckey_to_protocol[sk],\n",
" 'unit_results': srckey_to_unit_results[sk]}\n",
" \n",
" # write!\n",
" dump_json(replicate_data, rep_dir/f\"{leg}.json\")\n",
" working_dir_name = f\"shared_{sk}_attempt_0\"\n",
" ## TODO: make this work for arbitrary number of attempts \n",
" # os.symlink(orig_dir/leg/working_dir_name, rep_dir/leg/working_dir_name)\n",
" shutil.copytree(orig_dir/leg/working_dir_name, rep_dir/leg/working_dir_name)\n"
]
},
{
"cell_type": "markdown",
"id": "f864dcb3-bebf-425b-9154-bffc2b0e3f07",
"metadata": {},
"source": [
"## Check that the objects reload correctly"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6c20639c-8ba7-457a-bf8a-76c64aef4a38",
"metadata": {},
"outputs": [],
"source": [
"import openfe"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9cba8316-5500-4d5e-a84d-d72d09ba2a42",
"metadata": {},
"outputs": [],
"source": [
"json_reloaded = load_json(\"results_parallel/replicate_0/easy_rbfe_lig_ejm_31_solvent_lig_ejm_47_solvent.json\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c0e90b45-ae83-41c1-8748-0a8c1466b378",
"metadata": {},
"outputs": [],
"source": [
"json_reloaded['estimate'], json_reloaded['uncertainty']"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c0ce2bc6-d960-4521-b71c-316be0557e9d",
"metadata": {},
"outputs": [],
"source": [
"pr_reloaded = openfe.ProtocolResult.from_dict(json_reloaded['protocol_result'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a2fbb695-d4ef-45bd-af53-2ef9d0bc8e0a",
"metadata": {},
"outputs": [],
"source": [
"pr_reloaded.data"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "19662eaa-46de-4eb0-8c78-ddd6c68b12db",
"metadata": {},
"outputs": [],
"source": [
"first_pur_key = list(json_reloaded['unit_results'].keys())[0]\n",
"pur_reloaded = openfe.ProtocolUnit.from_dict(json_reloaded['unit_results'][first_pur_key])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0154fda2-4c1a-4064-8bcc-03aeecf13365",
"metadata": {},
"outputs": [],
"source": [
"pur_reloaded"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3f2bbc84-f59c-40b9-a176-9a733ff275c1",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Binary file added openfecli/tests/data/results_parallel.tar.gz
Binary file not shown.

0 comments on commit 55c9924

Please sign in to comment.