Merge pull request #1799 from NNPDF/run_wo_lhapdf
Add an `lhapdf_compatibility` module for LHAPDF
scarlehoff authored Jan 31, 2024
2 parents b06492e + 37812ac commit 62556ce
Showing 7 changed files with 165 additions and 44 deletions.
30 changes: 21 additions & 9 deletions n3fit/src/n3fit/backends/keras_backend/internal_state.py
@@ -2,21 +2,22 @@
Library of functions that modify the internal state of Keras/Tensorflow
"""
import os

import psutil

# Despite the current default being tf-eigen, the option below seems to have a positive impact
os.environ.setdefault("KMP_BLOCKTIME", "0")

# Reduce tensorflow verbosity
os.environ.setdefault("TF_CPP_MIN_LOG_LEVEL", "1")
import random as rn
import logging
import random as rn

import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import backend as K


log = logging.getLogger(__name__)


@@ -28,7 +29,7 @@ def set_eager(flag=True):
tf.config.run_functions_eagerly(flag)


def set_number_of_cores(max_cores=None):
def set_number_of_cores(max_cores=None, max_threads=None):
"""
Set the maximum number of cores and threads per core to be used by TF.
It defaults to the number of physical cores
@@ -56,9 +57,21 @@ def set_number_of_cores(max_cores=None):
# In any case, we never want to get above the number provided by the user
if max_cores is not None:
cores = min(cores, max_cores)

threads = tpc * 2
if max_threads is not None:
threads = min(max_threads, threads)

log.info("Setting the number of cores to: %d", cores)
tf.config.threading.set_inter_op_parallelism_threads(tpc * 2)
tf.config.threading.set_intra_op_parallelism_threads(cores)
try:
tf.config.threading.set_inter_op_parallelism_threads(threads)
tf.config.threading.set_intra_op_parallelism_threads(cores)
except RuntimeError:
# If pdfflow is being used, tensorflow will already be initialized by pdfflow
# maybe it would be good to drop completely pdfflow before starting the fit? (TODO ?)
log.warning(
"Could not set tensorflow parallelism settings from n3fit, maybe has already been initialized?"
)


def clear_backend_state():
@@ -115,13 +128,12 @@ def set_initial_state(debug=False, external_seed=None, max_cores=None):

# Set the number of cores depending on the user choice of max_cores
# if debug mode and no number of cores set by the user, set to 1
threads = None # auto
if debug and max_cores is None:
keras.utils.set_random_seed(7331)
threads = 1
tf.config.experimental.enable_op_determinism()
tf.config.threading.set_inter_op_parallelism_threads(1)
tf.config.threading.set_intra_op_parallelism_threads(1)
else:
set_number_of_cores(max_cores=max_cores)
set_number_of_cores(max_cores=max_cores, max_threads=threads)

# Once again, if in debug mode or external_seed set, set also the TF seed
if debug or external_seed:
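The key change in this hunk is the guard around the threading setters: `tf.config.threading.set_*_parallelism_threads` raises a `RuntimeError` once TensorFlow has already been initialized (for instance by pdfflow), so the calls are now wrapped and downgraded to a warning instead of aborting the fit. A minimal sketch of the same pattern with a hypothetical helper name (not the n3fit API):

```python
import logging

import psutil
import tensorflow as tf

log = logging.getLogger(__name__)


def configure_tf_parallelism(max_cores=None, max_threads=None):
    """Clamp TF threading to the physical core count, tolerating late calls."""
    cores = psutil.cpu_count(logical=False) or 1
    if max_cores is not None:
        cores = min(cores, max_cores)
    threads = 2 * cores if max_threads is None else min(max_threads, 2 * cores)
    try:
        # These setters raise RuntimeError once the TF runtime is initialized,
        # e.g. when pdfflow has already created tensors before the fit starts
        tf.config.threading.set_inter_op_parallelism_threads(threads)
        tf.config.threading.set_intra_op_parallelism_threads(cores)
    except RuntimeError:
        log.warning("TensorFlow already initialized; parallelism settings left unchanged")
```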
3 changes: 1 addition & 2 deletions validphys2/src/validphys/app.py
@@ -14,11 +14,10 @@
import os
import sys

import lhapdf

from reportengine import app
from validphys import mplstyles, uploadutils
from validphys.config import Config, Environment
from validphys.lhapdf_compatibility import lhapdf

providers = [
"validphys.results",
3 changes: 1 addition & 2 deletions validphys2/src/validphys/checks.py
@@ -9,7 +9,6 @@
import platform
import tempfile

import lhapdf
from matplotlib import scale as mscale

from reportengine.checks import CheckError, check, make_argcheck, make_check
@@ -71,7 +70,7 @@ def check_can_save_grid(ns, **kwags):
if not ns['installgrid']:
return

write_path = lhapdf.paths()[-1]
write_path = lhaindex.get_lha_datapath()
try:
tempfile.TemporaryFile(dir=write_path)
except OSError as e:
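`check_can_save_grid` now resolves the install location through `lhaindex` instead of calling `lhapdf.paths()` directly; the probe itself is unchanged: try to create a throwaway temporary file in the LHAPDF data path and fail the check if that raises `OSError`. A hedged sketch of that probe in isolation (hypothetical helper, not part of validphys):

```python
import tempfile

from validphys import lhaindex


def lhapdf_datapath_is_writable():
    """Return True if a temporary file can be created in the LHAPDF data path."""
    write_path = lhaindex.get_lha_datapath()
    try:
        with tempfile.TemporaryFile(dir=write_path):
            pass
        return True
    except OSError:
        return False
```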
19 changes: 7 additions & 12 deletions validphys2/src/validphys/lhaindex.py
@@ -12,9 +12,8 @@
import os.path as osp
import re

import lhapdf

from reportengine.compat import yaml
from validphys.lhapdf_compatibility import lhapdf

_indexes_to_names = None
_names_to_indexes = None
@@ -25,7 +24,7 @@ def expand_index_names(globstr):


def expand_local_names(globstr):
paths = get_lha_paths()
paths = lhapdf.paths()
return [
name
for path in paths
@@ -51,7 +50,7 @@ def get_indexes_to_names():


def finddir(name):
for path in get_lha_paths():
for path in lhapdf.paths():
d = osp.join(path, name)
if osp.isdir(d):
return d
@@ -60,7 +59,7 @@ def finddir(name):

def isinstalled(name):
"""Check that name exists in LHAPDF dir"""
return name and any(osp.isdir(osp.join(path, name)) for path in get_lha_paths())
return name and any(osp.isdir(osp.join(path, name)) for path in lhapdf.paths())


def get_names_to_indexes():
@@ -88,7 +87,7 @@ def get_pdf_name(index):

def parse_index(index_file):
d = {}
name_re = '(\d+)\s+(\S+)'
name_re = r'(\d+)\s+(\S+)'
with open(index_file) as localfile:
for line in localfile.readlines():
m = re.match(name_re, line)
@@ -116,7 +115,7 @@ def as_from_name(name):


def infofilename(name):
for path in get_lha_paths():
for path in lhapdf.paths():
info = osp.join(path, name, name + '.info')
if osp.exists(info):
return info
@@ -130,12 +129,8 @@ def parse_info(name):
return result


def get_lha_paths():
return lhapdf.paths()


def get_lha_datapath():
return get_lha_paths()[-1]
return lhapdf.paths()[-1]


def get_index_path(folder=None):
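With the local `get_lha_paths` helper gone, every lookup in `lhaindex` now goes through the compatibility layer's `lhapdf.paths()`, while the public helpers keep their behaviour. An illustrative use of those helpers (the set name is only an example and is assumed to be installed):

```python
from validphys import lhaindex

name = "NNPDF40_nnlo_as_01180"  # example set name, assumed installed
if lhaindex.isinstalled(name):
    folder = lhaindex.finddir(name)   # directory holding the grid files
    info = lhaindex.parse_info(name)  # parsed contents of the .info file
    print(folder, info.get("NumMembers"))
```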
125 changes: 125 additions & 0 deletions validphys2/src/validphys/lhapdf_compatibility.py
@@ -0,0 +1,125 @@
"""
Module for LHAPDF compatibility backends.
If LHAPDF is installed, the module transparently hands everything over to LHAPDF;
if LHAPDF is not available, it will try to use a combination of the packages
`lhapdf-management` and `pdfflow`,
which cover all the features of LHAPDF used during the fit (and likely most of validphys).
"""
from functools import cached_property

import numpy as np

try:
import lhapdf

USING_LHAPDF = True
except ModuleNotFoundError:
import logging

import lhapdf_management as lhapdf

log = logging.getLogger(__name__)
log.warning("LHAPDF was not found, using an alternative backend")

USING_LHAPDF = False


class _PDFFlowPDF:
"""Wrapper around the PDFFlow PDF so that it can be used as an LHAPDF
set by validphys
Takes as input a pdf_meta object (which is a PDFset from lhapdf_management
and which knows where the PDF needs to be loaded from) and a single member
Loading the PDF is done in a lazy manner since most of the time only a few members are needed.
Since PDFFlow is only utilized to load the PDF for interpolation, the import is delayed until
the first call to `mkPDF`. This allows the usage of most of validphys without tensorflow.
"""

def __init__(self, pdf_meta, member):
if USING_LHAPDF:
raise ValueError("PDFFlow should not be instantiated when using LHAPDF")

self._pdf_meta = pdf_meta
self._m = member
self._pdf = None
self._flavors = self._pdf_meta.info["Flavors"]

@cached_property
def pdf(self):
# Don't import pdfflow until it is really needed
import pdfflow

if self._pdf is None:
pdf_def = f"{self._pdf_meta.name}/{self._m}"
self._pdf = pdfflow.mkPDF(pdf_def, self._pdf_meta.path.parent)
return self._pdf

def flavors(self):
return self._flavors

def _xfxQ_all_pid(self, x, q):
x = np.atleast_1d(x)
q = np.atleast_1d(q)

res = self.pdf.py_xfxQ2_allpid(x, q**2).numpy()
return dict(zip(self._flavors, res.T))

def xfxQ(self, a, b, c=None):
"""Wrapper for the LHAPDF xfxQ function
This is an overloaded function in LHAPDF so depending
on the number of arguments we will do:
xfxQ(flavours, x, Q)
or
xfxQ(x, q)
All of x/q/flavours can be either a scalar or an array
"""
if c is None:
return self._xfxQ_all_pid(a, b)

# PDFFlow doesn't allow to ask for flavours that do not exist
# so let us retrieve all and return 0s for non existing flavs
ret_dict = self.xfxQ(b, c)
zeros = np.zeros_like(b)

if isinstance(a, int):
return ret_dict.get(a, zeros)
return [ret_dict.get(i, zeros) for i in a]

def xfxQ2(self, a, b, c=None):
"""Wrapper for LHAPDF xfxQ2 function, like xfxQ for Q2"""
if c is None:
return self.xfxQ(a, np.sqrt(b))
return self.xfxQ(a, b, np.sqrt(c))


def make_pdf(pdf_name, member=None):
"""Load a PDF
if member is given, load that single member; otherwise, load the entire set as a list
if LHAPDF is available, it returns LHAPDF PDF instances;
otherwise it returns an object which is _compatible_ with LHAPDF
for the lhapdf functions used by the selected backend
Parameters:
----------
pdf_name: str
name of the PDF to load
member: int
index of the member of the PDF to load
Returns:
-------
list(pdf_sets)
"""
if USING_LHAPDF:
if member is None:
return lhapdf.mkPDFs(pdf_name)
return [lhapdf.mkPDF(pdf_name, member)]

pdf_meta = lhapdf.load_pdf_meta(pdf_name)
if member is None:
return [_PDFFlowPDF(pdf_meta, m) for m in range(len(pdf_meta))]
return [_PDFFlowPDF(pdf_meta, member)]
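For context, this is how the new layer is meant to be consumed by validphys code; a sketch assuming a hypothetical installed set, valid for both the LHAPDF and the pdfflow-based backend:

```python
from validphys.lhapdf_compatibility import make_pdf

# Load all members, or a single one (a single member is still returned inside a list)
members = make_pdf("NNPDF40_nnlo_as_01180")        # example set name
central = make_pdf("NNPDF40_nnlo_as_01180", 0)[0]

# The same overloaded calls work for LHAPDF PDFs and for the _PDFFlowPDF wrapper
gluon = central.xfxQ(21, 1e-3, 100.0)  # xfxQ(flavour, x, Q) -> xf for one flavour
allfl = central.xfxQ(1e-3, 100.0)      # xfxQ(x, Q) -> {pid: xf} for all flavours
print(central.flavors(), gluon, allfl[21])
```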
7 changes: 4 additions & 3 deletions validphys2/src/validphys/lhapdfset.py
@@ -28,9 +28,10 @@
"""
import logging

import lhapdf
import numpy as np

from validphys.lhapdf_compatibility import make_pdf

log = logging.getLogger(__name__)


@@ -46,9 +47,9 @@ def __init__(self, name, error_type):
self._error_type = error_type
if self.is_t0:
# If at this point we already know this is a T0 set, load only the CV
self._lhapdf_set = [lhapdf.mkPDF(name)]
self._lhapdf_set = make_pdf(name, 0)
else:
self._lhapdf_set = lhapdf.mkPDFs(name)
self._lhapdf_set = make_pdf(name)
self._flavors = None

@property
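The constructor change is purely about which loader is called: a t0 set now goes through `make_pdf(name, 0)` so that only the central member is read, while any other set still loads every member. Illustrative instantiation (the set name and error-type strings are examples of the usual validphys inputs):

```python
from validphys.lhapdfset import LHAPDFSet

t0_set = LHAPDFSet("NNPDF40_nnlo_as_01180", "t0")          # loads only member 0
full_set = LHAPDFSet("NNPDF40_nnlo_as_01180", "replicas")  # loads every member of the set
```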
22 changes: 6 additions & 16 deletions validphys2/src/validphys/lhio.py
@@ -8,7 +8,6 @@
import pathlib
import shutil

import lhapdf
import numpy as np
import pandas as pd

@@ -137,9 +136,7 @@ def big_matrix(gridlist):
and the central value"""
central_value = gridlist[0]
X = pd.concat(
gridlist[1:],
axis=1,
keys=range(1, len(gridlist) + 1), # avoid confusion with rep0
gridlist[1:], axis=1, keys=range(1, len(gridlist) + 1) # avoid confusion with rep0
).subtract(central_value, axis=0)
if np.any(X.isnull()) or X.shape[0] != len(central_value):
raise ValueError("Incompatible grid specifications")
@@ -148,11 +145,7 @@

def rep_matrix(gridlist):
"""Return a properly indexes matrix of all the members"""
X = pd.concat(
gridlist,
axis=1,
keys=range(1, len(gridlist) + 1), # avoid confusion with rep0
)
X = pd.concat(gridlist, axis=1, keys=range(1, len(gridlist) + 1)) # avoid confusion with rep0
if np.ravel(pd.isnull(X)).any():
raise ValueError("Found null values in grid")
return X
@@ -239,6 +232,7 @@ def new_pdf_from_indexes(
files directly. It is slower and will call LHAPDF to fill the grids,
but works for sets where the replicas have different grids.
"""
import lhapdf

if extra_fields is not None:
raise NotImplementedError()
@@ -303,7 +297,7 @@ def hessian_from_lincomb(pdf, V, set_name=None, folder=None, extra_fields=None):
# preparing output folder
neig = V.shape[1]

base = pathlib.Path(lhapdf.paths()[-1]) / pdf.name
base = pathlib.Path(lhaindex.get_lha_datapath()) / pdf.name
if set_name is None:
set_name = pdf.name + "_hessian_" + str(neig)
if folder is None:
@@ -314,8 +308,7 @@ def hessian_from_lincomb(pdf, V, set_name=None, folder=None, extra_fields=None):
if os.path.exists(set_root):
shutil.rmtree(set_root)
log.warning(
"Target directory for new PDF, %s, already exists. Removing contents.",
set_root,
"Target directory for new PDF, %s, already exists. Removing contents.", set_root
)
os.makedirs(os.path.join(set_root))

@@ -336,10 +329,7 @@ def hessian_from_lincomb(pdf, V, set_name=None, folder=None, extra_fields=None):
yaml.dump(extra_fields, out, default_flow_style=False)

_headers, grids = load_all_replicas(pdf)
result = (big_matrix(grids).dot(V)).add(
grids[0],
axis=0,
)
result = (big_matrix(grids).dot(V)).add(grids[0], axis=0)
hess_header = b"PdfType: error\nFormat: lhagrid1\n"
for column in result.columns:
write_replica(column + 1, set_root, hess_header, result[column])
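The reflowed lines in `hessian_from_lincomb` do not change the algebra: the new eigenvector members are the replica deviations from member 0, combined through `V` and re-centred on the central value. A toy numeric sketch of that operation with made-up data (not the validphys API):

```python
import numpy as np
import pandas as pd

central = pd.Series([1.0, 2.0, 3.0])  # stands in for the member-0 grid
replicas = [central + np.random.normal(scale=0.1, size=3) for _ in range(5)]

# big_matrix: replicas stacked as columns 1..N, with the central value subtracted
X = pd.concat(replicas, axis=1, keys=range(1, 6)).subtract(central, axis=0)

V = np.random.rand(5, 2)                # linear combination defining 2 eigenvectors
result = X.dot(V).add(central, axis=0)  # columns become the new hessian members
print(result.shape)                     # (3, 2)
```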
