Merge pull request #1799 from NNPDF/run_wo_lhapdf
Add an `lhapdf_compatibility` module for LHAPDF
scarlehoff authored Jan 31, 2024
2 parents b06492e + 37812ac commit 62556ce
Showing 7 changed files with 165 additions and 44 deletions.
30 changes: 21 additions & 9 deletions n3fit/src/n3fit/backends/keras_backend/internal_state.py
@@ -2,21 +2,22 @@
Library of functions that modify the internal state of Keras/Tensorflow
"""
import os

import psutil

# Despite the current default being tf-eigen, the option below seems to have a positive impact
os.environ.setdefault("KMP_BLOCKTIME", "0")

# Reduce tensorflow verbosity
os.environ.setdefault("TF_CPP_MIN_LOG_LEVEL", "1")
import random as rn
import logging
import random as rn

import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import backend as K


log = logging.getLogger(__name__)


@@ -28,7 +29,7 @@ def set_eager(flag=True):
tf.config.run_functions_eagerly(flag)


def set_number_of_cores(max_cores=None):
def set_number_of_cores(max_cores=None, max_threads=None):
"""
Set the maximum number of cores and threads per core to be used by TF.
It defaults to the number of physical cores
@@ -56,9 +57,21 @@ def set_number_of_cores(max_cores=None):
# In any case, we never want to get above the number provided by the user
if max_cores is not None:
cores = min(cores, max_cores)

threads = tpc * 2
if max_threads is not None:
threads = min(max_threads, threads)

log.info("Setting the number of cores to: %d", cores)
tf.config.threading.set_inter_op_parallelism_threads(tpc * 2)
tf.config.threading.set_intra_op_parallelism_threads(cores)
try:
tf.config.threading.set_inter_op_parallelism_threads(threads)
tf.config.threading.set_intra_op_parallelism_threads(cores)
except RuntimeError:
# If pdfflow is being used, tensorflow will already be initialized by pdfflow
# maybe it would be good to drop completely pdfflow before starting the fit? (TODO ?)
log.warning(
"Could not set tensorflow parallelism settings from n3fit, maybe has already been initialized?"
)


def clear_backend_state():
@@ -115,13 +128,12 @@ def set_initial_state(debug=False, external_seed=None, max_cores=None):

# Set the number of cores depending on the user choice of max_cores
# if debug mode and no number of cores set by the user, set to 1
threads = None # auto
if debug and max_cores is None:
keras.utils.set_random_seed(7331)
threads = 1
tf.config.experimental.enable_op_determinism()
tf.config.threading.set_inter_op_parallelism_threads(1)
tf.config.threading.set_intra_op_parallelism_threads(1)
else:
set_number_of_cores(max_cores=max_cores)
set_number_of_cores(max_cores=max_cores, max_threads=threads)

# Once again, if in debug mode or external_seed set, set also the TF seed
if debug or external_seed:
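The key change in this hunk is the guard around the threading setters: `tf.config.threading.set_*_parallelism_threads` raises a `RuntimeError` once TensorFlow has already been initialized (for instance by pdfflow), so the calls are now wrapped and downgraded to a warning instead of aborting the fit. A minimal sketch of the same pattern with a hypothetical helper name (not the n3fit API):

```python
import logging

import psutil
import tensorflow as tf

log = logging.getLogger(__name__)


def configure_tf_parallelism(max_cores=None, max_threads=None):
    """Clamp TF threading to the physical core count, tolerating late calls."""
    cores = psutil.cpu_count(logical=False) or 1
    if max_cores is not None:
        cores = min(cores, max_cores)
    threads = 2 * cores if max_threads is None else min(max_threads, 2 * cores)
    try:
        # These setters raise RuntimeError once the TF runtime is initialized,
        # e.g. when pdfflow has already created tensors before the fit starts
        tf.config.threading.set_inter_op_parallelism_threads(threads)
        tf.config.threading.set_intra_op_parallelism_threads(cores)
    except RuntimeError:
        log.warning("TensorFlow already initialized; parallelism settings left unchanged")
```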
3 changes: 1 addition & 2 deletions validphys2/src/validphys/app.py
@@ -14,11 +14,10 @@
import os
import sys

import lhapdf

from reportengine import app
from validphys import mplstyles, uploadutils
from validphys.config import Config, Environment
from validphys.lhapdf_compatibility import lhapdf

providers = [
"validphys.results",
3 changes: 1 addition & 2 deletions validphys2/src/validphys/checks.py
@@ -9,7 +9,6 @@
import platform
import tempfile

import lhapdf
from matplotlib import scale as mscale

from reportengine.checks import CheckError, check, make_argcheck, make_check
@@ -71,7 +70,7 @@ def check_can_save_grid(ns, **kwags):
if not ns['installgrid']:
return

write_path = lhapdf.paths()[-1]
write_path = lhaindex.get_lha_datapath()
try:
tempfile.TemporaryFile(dir=write_path)
except OSError as e:
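`check_can_save_grid` now resolves the install location through `lhaindex` instead of calling `lhapdf.paths()` directly; the probe itself is unchanged: try to create a throwaway temporary file in the LHAPDF data path and fail the check if that raises `OSError`. A hedged sketch of that probe in isolation (hypothetical helper, not part of validphys):

```python
import tempfile

from validphys import lhaindex


def lhapdf_datapath_is_writable():
    """Return True if a temporary file can be created in the LHAPDF data path."""
    write_path = lhaindex.get_lha_datapath()
    try:
        with tempfile.TemporaryFile(dir=write_path):
            pass
        return True
    except OSError:
        return False
```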
19 changes: 7 additions & 12 deletions validphys2/src/validphys/lhaindex.py
@@ -12,9 +12,8 @@
import os.path as osp
import re

import lhapdf

from reportengine.compat import yaml
from validphys.lhapdf_compatibility import lhapdf

_indexes_to_names = None
_names_to_indexes = None
@@ -25,7 +24,7 @@ def expand_index_names(globstr):


def expand_local_names(globstr):
paths = get_lha_paths()
paths = lhapdf.paths()
return [
name
for path in paths
@@ -51,7 +50,7 @@ def get_indexes_to_names():


def finddir(name):
for path in get_lha_paths():
for path in lhapdf.paths():
d = osp.join(path, name)
if osp.isdir(d):
return d
@@ -60,7 +59,7 @@ def finddir(name):

def isinstalled(name):
"""Check that name exists in LHAPDF dir"""
return name and any(osp.isdir(osp.join(path, name)) for path in get_lha_paths())
return name and any(osp.isdir(osp.join(path, name)) for path in lhapdf.paths())


def get_names_to_indexes():
@@ -88,7 +87,7 @@ def get_pdf_name(index):

def parse_index(index_file):
d = {}
name_re = '(\d+)\s+(\S+)'
name_re = r'(\d+)\s+(\S+)'
with open(index_file) as localfile:
for line in localfile.readlines():
m = re.match(name_re, line)
@@ -116,7 +115,7 @@ def as_from_name(name):


def infofilename(name):
for path in get_lha_paths():
for path in lhapdf.paths():
info = osp.join(path, name, name + '.info')
if osp.exists(info):
return info
@@ -130,12 +129,8 @@ def parse_info(name):
return result


def get_lha_paths():
return lhapdf.paths()


def get_lha_datapath():
return get_lha_paths()[-1]
return lhapdf.paths()[-1]


def get_index_path(folder=None):
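With the local `get_lha_paths` helper gone, every lookup in `lhaindex` now goes through the compatibility layer's `lhapdf.paths()`, while the public helpers keep their behaviour. An illustrative use of those helpers (the set name is only an example and is assumed to be installed):

```python
from validphys import lhaindex

name = "NNPDF40_nnlo_as_01180"  # example set name, assumed installed
if lhaindex.isinstalled(name):
    folder = lhaindex.finddir(name)   # directory holding the grid files
    info = lhaindex.parse_info(name)  # parsed contents of the .info file
    print(folder, info.get("NumMembers"))
```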
125 changes: 125 additions & 0 deletions validphys2/src/validphys/lhapdf_compatibility.py
@@ -0,0 +1,125 @@
"""
Module for LHAPDF compatibility backends.
If LHAPDF is installed, the module transparently hands everything over to LHAPDF;
if LHAPDF is not available, it will try to use a combination of the packages
`lhapdf-management` and `pdfflow`,
which cover all the features of LHAPDF used during the fit (and likely most of validphys).
"""
from functools import cached_property

import numpy as np

try:
import lhapdf

USING_LHAPDF = True
except ModuleNotFoundError:
import logging

import lhapdf_management as lhapdf

log = logging.getLogger(__name__)
log.warning("LHAPDF was not found, using an alternative backend")

USING_LHAPDF = False


class _PDFFlowPDF:
"""Wrapper around the PDFFlow PDF so that it can be used as an LHAPDF
set by validphys
Takes as input a pdf_meta object (which is a PDFset from lhapdf_management
and which knows where the PDF needs to be loaded from) and a single member
Loading the PDF is done in a lazy manner since most of the time only a few members are needed.
Since PDFFlow is only utilized to load the PDF for interpolation, the import is delayed until
the first call to `mkPDF`. This allows the usage of most of validphys without tensorflow.
"""

def __init__(self, pdf_meta, member):
if USING_LHAPDF:
raise ValueError("PDFFlow should not be instantiated when using LHAPDF")

self._pdf_meta = pdf_meta
self._m = member
self._pdf = None
self._flavors = self._pdf_meta.info["Flavors"]

@cached_property
def pdf(self):
# Don't import pdfflow until it is really needed
import pdfflow

if self._pdf is None:
pdf_def = f"{self._pdf_meta.name}/{self._m}"
self._pdf = pdfflow.mkPDF(pdf_def, self._pdf_meta.path.parent)
return self._pdf

def flavors(self):
return self._flavors

def _xfxQ_all_pid(self, x, q):
x = np.atleast_1d(x)
q = np.atleast_1d(q)

res = self.pdf.py_xfxQ2_allpid(x, q**2).numpy()
return dict(zip(self._flavors, res.T))

def xfxQ(self, a, b, c=None):
"""Wrapper for the LHAPDF xfxQ function
This is an overloaded function in LHAPDF so depending
on the number of arguments we will do:
xfxQ(flavours, x, Q)
or
xfxQ(x, q)
All of x/q/flavours can be either a scalar or an array
"""
if c is None:
return self._xfxQ_all_pid(a, b)

# PDFFlow doesn't allow to ask for flavours that do not exist
# so let us retrieve all and return 0s for non existing flavs
ret_dict = self.xfxQ(b, c)
zeros = np.zeros_like(b)

if isinstance(a, int):
return ret_dict.get(a, zeros)
return [ret_dict.get(i, zeros) for i in a]

def xfxQ2(self, a, b, c=None):
"""Wrapper for LHAPDF xfxQ2 function, like xfxQ for Q2"""
if c is None:
return self.xfxQ(a, np.sqrt(b))
return self.xfxQ(a, b, np.sqrt(c))


def make_pdf(pdf_name, member=None):
"""Load a PDF
if member is given, load that single member; otherwise, load the entire set as a list
if LHAPDF is available, it returns LHAPDF PDF instances;
otherwise it returns an object which is _compatible_ with LHAPDF
for the lhapdf functions used by the selected backend
Parameters:
----------
pdf_name: str
name of the PDF to load
member: int
index of the member of the PDF to load
Returns:
-------
list(pdf_sets)
"""
if USING_LHAPDF:
if member is None:
return lhapdf.mkPDFs(pdf_name)
return [lhapdf.mkPDF(pdf_name, member)]

pdf_meta = lhapdf.load_pdf_meta(pdf_name)
if member is None:
return [_PDFFlowPDF(pdf_meta, m) for m in range(len(pdf_meta))]
return [_PDFFlowPDF(pdf_meta, member)]
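For context, this is how the new layer is meant to be consumed by validphys code; a sketch assuming a hypothetical installed set, valid for both the LHAPDF and the pdfflow-based backend:

```python
from validphys.lhapdf_compatibility import make_pdf

# Load all members, or a single one (a single member is still returned inside a list)
members = make_pdf("NNPDF40_nnlo_as_01180")        # example set name
central = make_pdf("NNPDF40_nnlo_as_01180", 0)[0]

# The same overloaded calls work for LHAPDF PDFs and for the _PDFFlowPDF wrapper
gluon = central.xfxQ(21, 1e-3, 100.0)  # xfxQ(flavour, x, Q) -> xf for one flavour
allfl = central.xfxQ(1e-3, 100.0)      # xfxQ(x, Q) -> {pid: xf} for all flavours
print(central.flavors(), gluon, allfl[21])
```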
7 changes: 4 additions & 3 deletions validphys2/src/validphys/lhapdfset.py
@@ -28,9 +28,10 @@
"""
import logging

import lhapdf
import numpy as np

from validphys.lhapdf_compatibility import make_pdf

log = logging.getLogger(__name__)


@@ -46,9 +47,9 @@ def __init__(self, name, error_type):
self._error_type = error_type
if self.is_t0:
# If at this point we already know this is a T0 set, load only the CV
self._lhapdf_set = [lhapdf.mkPDF(name)]
self._lhapdf_set = make_pdf(name, 0)
else:
self._lhapdf_set = lhapdf.mkPDFs(name)
self._lhapdf_set = make_pdf(name)
self._flavors = None

@property
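The constructor change is purely about which loader is called: a t0 set now goes through `make_pdf(name, 0)` so that only the central member is read, while any other set still loads every member. Illustrative instantiation (the set name and error-type strings are examples of the usual validphys inputs):

```python
from validphys.lhapdfset import LHAPDFSet

t0_set = LHAPDFSet("NNPDF40_nnlo_as_01180", "t0")          # loads only member 0
full_set = LHAPDFSet("NNPDF40_nnlo_as_01180", "replicas")  # loads every member of the set
```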
22 changes: 6 additions & 16 deletions validphys2/src/validphys/lhio.py
@@ -8,7 +8,6 @@
import pathlib
import shutil

import lhapdf
import numpy as np
import pandas as pd

@@ -137,9 +136,7 @@ def big_matrix(gridlist):
and the central value"""
central_value = gridlist[0]
X = pd.concat(
gridlist[1:],
axis=1,
keys=range(1, len(gridlist) + 1), # avoid confusion with rep0
gridlist[1:], axis=1, keys=range(1, len(gridlist) + 1) # avoid confusion with rep0
).subtract(central_value, axis=0)
if np.any(X.isnull()) or X.shape[0] != len(central_value):
raise ValueError("Incompatible grid specifications")
@@ -148,11 +145,7 @@

def rep_matrix(gridlist):
"""Return a properly indexes matrix of all the members"""
X = pd.concat(
gridlist,
axis=1,
keys=range(1, len(gridlist) + 1), # avoid confusion with rep0
)
X = pd.concat(gridlist, axis=1, keys=range(1, len(gridlist) + 1)) # avoid confusion with rep0
if np.ravel(pd.isnull(X)).any():
raise ValueError("Found null values in grid")
return X
@@ -239,6 +232,7 @@ def new_pdf_from_indexes(
files directly. It is slower and will call LHAPDF to fill the grids,
but works for sets where the replicas have different grids.
"""
import lhapdf

if extra_fields is not None:
raise NotImplementedError()
@@ -303,7 +297,7 @@ def hessian_from_lincomb(pdf, V, set_name=None, folder=None, extra_fields=None):
# preparing output folder
neig = V.shape[1]

base = pathlib.Path(lhapdf.paths()[-1]) / pdf.name
base = pathlib.Path(lhaindex.get_lha_datapath()) / pdf.name
if set_name is None:
set_name = pdf.name + "_hessian_" + str(neig)
if folder is None:
@@ -314,8 +308,7 @@ def hessian_from_lincomb(pdf, V, set_name=None, folder=None, extra_fields=None):
if os.path.exists(set_root):
shutil.rmtree(set_root)
log.warning(
"Target directory for new PDF, %s, already exists. Removing contents.",
set_root,
"Target directory for new PDF, %s, already exists. Removing contents.", set_root
)
os.makedirs(os.path.join(set_root))

@@ -336,10 +329,7 @@ def hessian_from_lincomb(pdf, V, set_name=None, folder=None, extra_fields=None):
yaml.dump(extra_fields, out, default_flow_style=False)

_headers, grids = load_all_replicas(pdf)
result = (big_matrix(grids).dot(V)).add(
grids[0],
axis=0,
)
result = (big_matrix(grids).dot(V)).add(grids[0], axis=0)
hess_header = b"PdfType: error\nFormat: lhagrid1\n"
for column in result.columns:
write_replica(column + 1, set_root, hess_header, result[column])
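The reflowed lines in `hessian_from_lincomb` do not change the algebra: the new eigenvector members are the replica deviations from member 0, combined through `V` and re-centred on the central value. A toy numeric sketch of that operation with made-up data (not the validphys API):

```python
import numpy as np
import pandas as pd

central = pd.Series([1.0, 2.0, 3.0])  # stands in for the member-0 grid
replicas = [central + np.random.normal(scale=0.1, size=3) for _ in range(5)]

# big_matrix: replicas stacked as columns 1..N, with the central value subtracted
X = pd.concat(replicas, axis=1, keys=range(1, 6)).subtract(central, axis=0)

V = np.random.rand(5, 2)                # linear combination defining 2 eigenvectors
result = X.dot(V).add(central, axis=0)  # columns become the new hessian members
print(result.shape)                     # (3, 2)
```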
