From e84bccc799eedd1369e73329cb07c7c9f7c01429 Mon Sep 17 00:00:00 2001 From: MorrowChem Date: Mon, 10 Jun 2024 12:05:42 +0100 Subject: [PATCH 1/7] added test for regularization, cleaned up reg funcs, added extra docs, imprv robustness, attributed reg.py code --- autoplex/fitting/common/regularization.py | 136 ++++++++++--------- tests/fitting/test_fitting_regularization.py | 55 ++++++++ 2 files changed, 127 insertions(+), 64 deletions(-) create mode 100644 tests/fitting/test_fitting_regularization.py diff --git a/autoplex/fitting/common/regularization.py b/autoplex/fitting/common/regularization.py index c1592e46b..edbdf5a85 100644 --- a/autoplex/fitting/common/regularization.py +++ b/autoplex/fitting/common/regularization.py @@ -1,4 +1,4 @@ -"""Functions for automatic regularization and weighting of training data.""" +"""Functions for automatic regularization and weighting of training data, adapted from JDM's RSS routines.""" from __future__ import annotations @@ -11,27 +11,31 @@ def set_sigma( atoms, - etup, + reg_minmax, isol_es: dict | None = None, scheme="linear-hull", energy_name="REF_energy", force_name="REF_forces", virial_name="REF_virial", element_order=None, + max_energy=20.0, + config_type_override=None, ): """ Handle automatic regularisation based on distance to convex hull, amongst other things. - Need to make sure this works for multi-stoichiometry systems. + TODO: Need to make sure this works for full multi-stoichiometry systems. Parameters ---------- atoms: (list of ase.Atoms) list of atoms objects to set reg. for. Usually fitting database - etup: (list of tuples) + reg_minmax: (list of tuples) list of tuples of (min, max) values for energy, force, virial sigmas scheme: (str) - scheme to use for regularisation. Options are: linear_hull, volume-stoichiometry + method to use for regularization. Options are: linear_hull, volume-stoichiometry + linear_hull: for single-composition system, use 2D convex hull (E, V) + volume-stoichiometry: for multi-composition system, use 3D convex hull of (E, V, mole-fraction) energy_name: (str) name of energy key in atoms.info force_name: (str) @@ -44,25 +48,40 @@ def set_sigma( for SiO2 element_order: (list) list of atomic numbers in order of choice (e.g. [42, 16] for MoS2) + max_energy: (float) + ignore any structures with energy above hull greater than this value (eV) + config_type_override: (dict) + give custom regularization for specific config types - e.g. etup = [(0.1, 1), (0.001, 0.1), (0.0316, 0.316), (0.0632, 0.632)] + e.g. reg_minmax = [(0.1, 1), (0.001, 0.1), (0.0316, 0.316), (0.0632, 0.632)] [(emin, emax), (semin, semax), (sfmin, sfmax), (ssvmin, svmax)] """ sigs = [[], [], []] # type: ignore atoms_modi = [] + if config_type_override is None: + config_type_override = { + "IsolatedAtom": (1e-4, 0.0, 0.0), + "dimer": (0.1, 0.5, 0.0), + } + for at in atoms: + for config_type, sigs_override in config_type_override.items(): + if at.info["config_type"] == config_type: + at.info["energy_sigma"] = sigs_override[0] + at.info["force_sigma"] = sigs_override[1] + at.info["virial_sigma"] = sigs_override[2] + atoms_modi.append(at) + if at.info["config_type"] == "IsolatedAtom": - # isol_es - at.info["energy_sigma"] = 0.0001 at.calc = None # TODO side-effect alert + del at.info["force_sigma"] + del at.info["virial_sigma"] atoms_modi.append(at) continue if at.info["config_type"] == "dimer": - at.info["energy_sigma"] = 0.1 - at.info["force_sigma"] = 0.5 with suppress(Exception): del at.info[virial_name] @@ -70,16 +89,11 @@ def set_sigma( continue if scheme == "linear-hull": - # Use this one for a simple single-composition system. - # Makes a 2D convex hull of volume vs. energy and calculates distance to it print("Regularising with linear hull") hull, p = get_convex_hull(atoms, energy_name=energy_name) - # print("got hull pts", [p1 for p1 in zip(hull[:, 0], hull[:, 1])]) get_e_distance_func = get_e_distance_to_hull elif scheme == "volume-stoichiometry": - # Use this one for a binary or pseudo-binary-composition system. - # Makes a 3D convex hull of volume vs. mole fraction vs. energy and calculates distance to it print("Regularising with 3D volume-mole fraction hull") if isol_es == {}: raise ValueError("Need to supply dictionary of isolated energies.") @@ -89,7 +103,6 @@ def set_sigma( ) # label atoms with volume and mole fraction hull = calculate_hull_3D(p) # calculate 3D convex hull get_e_distance_func = get_e_distance_to_hull_3D # type: ignore - # function to calculate distance to hull (in energy) p = {} for group in sorted( @@ -118,48 +131,50 @@ def set_sigma( hull, val, energy_name=energy_name, isol_es=isol_es ) - if de > 20.0: + if de > max_energy: # don't even fit if too high continue - if val.info["config_type"] != "IsolatedAtom": + if val.info["config_type"] not in config_type_override: if group == "initial": - sigs[0].append(etup[1][1]) - sigs[1].append(etup[2][1]) - sigs[2].append(etup[3][1]) - val.info["energy_sigma"] = etup[1][1] - val.info["force_sigma"] = etup[2][1] - val.info["virial_sigma"] = etup[3][1] + sigs[0].append(reg_minmax[1][1]) + sigs[1].append(reg_minmax[2][1]) + sigs[2].append(reg_minmax[3][1]) + val.info["energy_sigma"] = reg_minmax[1][1] + val.info["force_sigma"] = reg_minmax[2][1] + val.info["virial_sigma"] = reg_minmax[3][1] atoms_modi.append(val) continue - if de <= etup[0][0]: - sigs[0].append(etup[1][0]) - sigs[1].append(etup[2][0]) - sigs[2].append(etup[3][0]) - val.info["energy_sigma"] = etup[1][0] - val.info["force_sigma"] = etup[2][0] - val.info["virial_sigma"] = etup[3][0] + if de <= reg_minmax[0][0]: + sigs[0].append(reg_minmax[1][0]) + sigs[1].append(reg_minmax[2][0]) + sigs[2].append(reg_minmax[3][0]) + val.info["energy_sigma"] = reg_minmax[1][0] + val.info["force_sigma"] = reg_minmax[2][0] + val.info["virial_sigma"] = reg_minmax[3][0] atoms_modi.append(val) - elif de >= etup[0][1]: - sigs[0].append(etup[1][1]) - sigs[1].append(etup[2][1]) - sigs[2].append(etup[3][1]) - val.info["energy_sigma"] = etup[1][1] - val.info["force_sigma"] = etup[2][1] - val.info["virial_sigma"] = etup[3][1] + elif de >= reg_minmax[0][1]: + sigs[0].append(reg_minmax[1][1]) + sigs[1].append(reg_minmax[2][1]) + sigs[2].append(reg_minmax[3][1]) + val.info["energy_sigma"] = reg_minmax[1][1] + val.info["force_sigma"] = reg_minmax[2][1] + val.info["virial_sigma"] = reg_minmax[3][1] atoms_modi.append(val) else: - # rat = (de-etup[0][0]) / (etup[0][1]-etup[0][0]) - # e = rat*(etup[1][1]-etup[1][0]) + etup[1][0] - # f = rat*(etup[2][1]-etup[2][0]) + etup[2][0] - # v = rat*(etup[3][1]-etup[3][0]) + etup[3][0] [e, f, v] = piecewise_linear( de, [ - (0.1, [etup[1][0], etup[2][0], etup[3][0]]), - (1.0, [etup[1][1], etup[2][1], etup[3][1]]), + ( + 0.1, + [reg_minmax[1][0], reg_minmax[2][0], reg_minmax[3][0]], + ), + ( + 1.0, + [reg_minmax[1][1], reg_minmax[2][1], reg_minmax[3][1]], + ), ], ) sigs[0].append(e) @@ -174,15 +189,17 @@ def set_sigma( data_type = [np.array(sig) for sig in sigs] # [e, f, v] for label, data in zip(labels, data_type): + if len(data) == 0: + print("No automatic regularisation performed (no structures requested)") + continue if label == "E": - # Print header + # Report of the regularisation statistics print(f"Automatic regularisation statistics for {len(data)} structures:\n") print( "{:>20s}{:>20s}{:>20s}{:>20s}{:>20s}".format( "", "Mean", "Std", "Nmin", "Nmax" ) ) - print( "{:>20s}{:>20.4f}{:>20.4f}{:>20d}{:>20d}".format( label, @@ -198,7 +215,7 @@ def set_sigma( def get_convex_hull(atoms, energy_name="energy", **kwargs): """ - Calculate simple linear convex hull of volume vs. energy. + Calculate simple linear (E,V) convex hull. Parameters ---------- @@ -209,7 +226,8 @@ def get_convex_hull(atoms, energy_name="energy", **kwargs): Returns ------- - the list of points in the convex hull (lower half only), and additionally all the points for testing purposes + the list of points in the convex hull (lower half only), + and additionally all the points for testing purposes """ p = [] @@ -225,7 +243,8 @@ def get_convex_hull(atoms, energy_name="energy", **kwargs): p.append((v, e)) except Exception: ct += 1 - print(f"Convex hull failed to include {ct}/{len(atoms)} structures") + if ct > 0: + print(f"Convex hull failed to include {ct}/{len(atoms)} structures") p = np.array(p) p = p.T[:, np.argsort(p.T[0])].T # sort in volume axis @@ -251,8 +270,6 @@ def get_convex_hull(atoms, energy_name="energy", **kwargs): lower_half_hull_points[:, 1] <= np.max(lower_half_hull_points[:, 1]) ] - # plot_convex_hull(p, lower_half_hull_points) - return lower_half_hull_points, p @@ -273,7 +290,6 @@ def get_e_distance_to_hull(hull, at, energy_name="energy", **kwargs): volume = at.get_volume() / len(at) energy = at.info[energy_name] / len(at) tp = np.array([volume, energy]) - hull_ps = hull.points if isinstance(hull, ConvexHull) else hull if any( @@ -288,8 +304,8 @@ def get_e_distance_to_hull(hull, at, energy_name="energy", **kwargs): return ( energy - get_intersect( - tp, # get intersection of the vertical line (energy axis) and the line between the nearest hull points - tp + np.array([0, 1]), + tp, # get intersection of the vertical line (energy axis) + tp + np.array([0, 1]), # and the line between the nearest hull points hull_ps[(nearest - 1) % len(hull_ps.T[0])], hull_ps[nearest % len(hull_ps.T[0])], )[1] @@ -340,7 +356,6 @@ def get_x(at, element_order=None): el, cts = np.unique(at.get_atomic_numbers(), return_counts=True) if element_order is None and len(el) < 3: # compatibility with old version - # print('using old version of get_x') x = cts[1] / sum(cts) if len(el) == 2 else 1 else: # new version, requires element_order, recommended for all new calculations @@ -355,9 +370,7 @@ def get_x(at, element_order=None): el = np.array([el[np.argwhere(el == i).squeeze()] for i in element_order]) x = cts[1:] / sum(cts) - # print(x, at) - # print(at) return x @@ -370,7 +383,7 @@ def label_stoichiometry_volume(ats, isol_es, e_name, element_order=None): ats: (list) list of atoms objects isol_es: (dict) - dictionary of isolated atom energies + dictionary of isolated atom energies {atomic_number: energy} e_name: (str) name of energy key in atoms.info (typically a DFT energy) element_order: (list) @@ -378,7 +391,7 @@ def label_stoichiometry_volume(ats, isol_es, e_name, element_order=None): """ p = [] - for _ct, at in enumerate(ats): + for at in ats: try: v = at.get_volume() / len(at) # make energy relative to isolated atoms @@ -386,7 +399,6 @@ def label_stoichiometry_volume(ats, isol_es, e_name, element_order=None): at.info[e_name] - sum([isol_es[j] for j in at.get_atomic_numbers()]) ) / len(at) x = get_x(at, element_order=element_order) - # print(x) p.append(np.hstack((x, v, e))) except Exception: traceback.print_exc() @@ -503,7 +515,6 @@ def calculate_hull_ND(p): pn = np.delete(pn, i, axis=1) print(f"Convex hull lower dimensional - removing dimension {i}") remove_dim.append(i) - # print(pn) hull = ConvexHull(pn, qhull_options="QG0") print("done calculating hull") @@ -547,9 +558,6 @@ def get_e_distance_to_hull_3D( return get_e_distance_to_hull(hull, at, energy_name=energy_name) for _ct, visible_facet in enumerate(hull.simplices[hull.good]): - # print(sp[:-1]) - # print(*hull.points[visible_facet][:, :-1]) - if point_in_triangle_ND(sp[:-1], *hull.points[visible_facet][:, :-1]): n_3 = hull.points[visible_facet] e = sp[-1] diff --git a/tests/fitting/test_fitting_regularization.py b/tests/fitting/test_fitting_regularization.py new file mode 100644 index 000000000..56a77fa5c --- /dev/null +++ b/tests/fitting/test_fitting_regularization.py @@ -0,0 +1,55 @@ +from __future__ import annotations +import pytest +from ase.io import read +from ase.atoms import Atom +from autoplex.fitting.common.regularization import ( + set_sigma +) + + +def test_set_sigma(test_dir): + # data setup + test_atoms = read(test_dir / 'fitting/pre_xyz_train_more_data.extxyz', ':') + isol_es = {3: -0.28649227, 17: -0.28649227} + reg_minmax = [(0.1, 1), (0.001, 0.1), + (0.0316, 0.316), + (0.0632, 0.632)] + + # test series of options for set_sigma + + atoms_modi = set_sigma(test_atoms, + reg_minmax, + scheme='linear-hull',) + assert atoms_modi[2].info['energy_sigma'] == 0.001 + + + atoms_modi = set_sigma(test_atoms, + reg_minmax, + scheme='linear-hull', + config_type_override={'test': [1e-4, 1e-4, 1e-4]} + ) + assert atoms_modi[2].info['energy_sigma'] == 1e-4 + + + atoms_modi[0].info['REF_energy'] += 20 + for atoms in atoms_modi[:3]: + atoms.set_cell([10, 10, 10]) + for atoms in atoms_modi[4:]: + atoms.set_cell([11, 11, 11]) + atoms_modi = set_sigma(test_atoms, + reg_minmax, + scheme='linear-hull', + max_energy=0.05, + isol_es=isol_es + ) + assert len(atoms_modi) < len(test_atoms) + + + atoms_modi[0].append(Atom('Li', [1,1,1])) + atoms_modi = set_sigma(test_atoms, + reg_minmax, + scheme='volume-stoichiometry', + isol_es=isol_es + ) + assert True # TODO: modify this to test actual condition + \ No newline at end of file From cc2f81dfb188055bd8014802871ccb75078d6ffe Mon Sep 17 00:00:00 2001 From: MorrowChem Date: Mon, 10 Jun 2024 12:32:02 +0100 Subject: [PATCH 2/7] moved attribution from description string --- autoplex/fitting/common/regularization.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/autoplex/fitting/common/regularization.py b/autoplex/fitting/common/regularization.py index edbdf5a85..277bf4b6f 100644 --- a/autoplex/fitting/common/regularization.py +++ b/autoplex/fitting/common/regularization.py @@ -1,5 +1,6 @@ -"""Functions for automatic regularization and weighting of training data, adapted from JDM's RSS routines.""" +"""Functions for automatic regularization and weighting of training data.""" +# adapted from MorrowChem's RSS routines. from __future__ import annotations import traceback From 26eb3da6555ac861e6e6fc0a5df8361f1ad964a9 Mon Sep 17 00:00:00 2001 From: Christina Ertural <52951132+QuantumChemist@users.noreply.github.com> Date: Mon, 10 Jun 2024 15:59:34 +0200 Subject: [PATCH 3/7] reduce splits of the test --- .github/workflows/python-package.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 406edfdfe..ecfb2b00f 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -17,7 +17,7 @@ jobs: fail-fast: false matrix: python-version: ["3.9", "3.10", "3.11"] - split: [1, 2, 3, 4] # Number of splits + split: [1] #, 2, 3, 4] # Number of splits steps: - uses: actions/checkout@v3 @@ -50,7 +50,7 @@ jobs: run: | # run the line below locally to update tests durations file # pytest --cov=autoplex --cov-append --splits 1 --group 1 --durations-path ./tests/test_data/.pytest-split-durations --store-durations - pytest --cov=autoplex --cov-report term-missing --cov-append --splits 4 --group ${{ matrix.split }} -vv --durations-path ./tests/test_data/.pytest-split-durations + pytest --cov=autoplex --cov-report term-missing --cov-append --splits 1 --group ${{ matrix.split }} -vv --durations-path ./tests/test_data/.pytest-split-durations - name: Upload coverage uses: actions/upload-artifact@v3 with: From 686dabd0ceb6656f8696b282dcb13a2cf9cf1a67 Mon Sep 17 00:00:00 2001 From: MorrowChem Date: Mon, 10 Jun 2024 15:24:42 +0100 Subject: [PATCH 4/7] variable name change --- autoplex/fitting/common/flows.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autoplex/fitting/common/flows.py b/autoplex/fitting/common/flows.py index 6923d9f1d..558d75749 100644 --- a/autoplex/fitting/common/flows.py +++ b/autoplex/fitting/common/flows.py @@ -268,7 +268,7 @@ def make( ase.io.write("train_wo_sigma.extxyz", atoms, format="extxyz") atoms_with_sigma = set_sigma( atoms, - etup=[(0.1, 1), (0.001, 0.1), (0.0316, 0.316), (0.0632, 0.632)], + reg_minmax=[(0.1, 1), (0.001, 0.1), (0.0316, 0.316), (0.0632, 0.632)], ) ase.io.write("train.extxyz", atoms_with_sigma, format="extxyz") if self.separated: From 26c62be19abea7573c4524f2d6e6ef44100b2b11 Mon Sep 17 00:00:00 2001 From: Christina Ertural <52951132+QuantumChemist@users.noreply.github.com> Date: Tue, 11 Jun 2024 18:24:23 +0200 Subject: [PATCH 5/7] increase splits of test --- .github/workflows/python-package.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index ecfb2b00f..480c2e059 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -5,7 +5,7 @@ name: Testing Linux on: push: - branches: [ main ] + branches: [ * ] pull_request: branches: [ main ] @@ -17,7 +17,7 @@ jobs: fail-fast: false matrix: python-version: ["3.9", "3.10", "3.11"] - split: [1] #, 2, 3, 4] # Number of splits + split: [1, 2, 3, 4] # Number of splits steps: - uses: actions/checkout@v3 @@ -50,7 +50,7 @@ jobs: run: | # run the line below locally to update tests durations file # pytest --cov=autoplex --cov-append --splits 1 --group 1 --durations-path ./tests/test_data/.pytest-split-durations --store-durations - pytest --cov=autoplex --cov-report term-missing --cov-append --splits 1 --group ${{ matrix.split }} -vv --durations-path ./tests/test_data/.pytest-split-durations + pytest --cov=autoplex --cov-report term-missing --cov-append --splits 4 --group ${{ matrix.split }} -vv --durations-path ./tests/test_data/.pytest-split-durations - name: Upload coverage uses: actions/upload-artifact@v3 with: From 38c88e7d5a20d4ced707921cbd428eda3aab3912 Mon Sep 17 00:00:00 2001 From: Aakash Ashok Naik <91958822+naik-aakash@users.noreply.github.com> Date: Tue, 11 Jun 2024 18:36:12 +0200 Subject: [PATCH 6/7] Update python-package.yml --- .github/workflows/python-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 480c2e059..c87c0c057 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -5,7 +5,7 @@ name: Testing Linux on: push: - branches: [ * ] + branches: '*' pull_request: branches: [ main ] From 5af00896cb852ea1a65ca3594833c9adae96f18d Mon Sep 17 00:00:00 2001 From: MorrowChem Date: Wed, 12 Jun 2024 11:39:32 +0100 Subject: [PATCH 7/7] bug fix - duplicated structures added for fitting --- autoplex/fitting/common/regularization.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/autoplex/fitting/common/regularization.py b/autoplex/fitting/common/regularization.py index 277bf4b6f..5aa207933 100644 --- a/autoplex/fitting/common/regularization.py +++ b/autoplex/fitting/common/regularization.py @@ -79,14 +79,12 @@ def set_sigma( at.calc = None # TODO side-effect alert del at.info["force_sigma"] del at.info["virial_sigma"] - atoms_modi.append(at) continue if at.info["config_type"] == "dimer": with suppress(Exception): del at.info[virial_name] - atoms_modi.append(at) continue if scheme == "linear-hull":