Skip to content

Commit

Permalink
Merge pull request #2516 from devitocodes/async-loads-final-2
Browse files Browse the repository at this point in the history
compiler: Misc improvements to code generation
  • Loading branch information
FabioLuporini authored Jan 28, 2025
2 parents f71764a + b8de9ec commit 82cdb29
Show file tree
Hide file tree
Showing 14 changed files with 333 additions and 131 deletions.
58 changes: 58 additions & 0 deletions devito/arch/archinfo.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@
'POWER8', 'POWER9',
# Generic GPUs
'AMDGPUX', 'NVIDIAX', 'INTELGPUX',
# Nvidia GPUs
'VOLTA', 'AMPERE', 'HOPPER', 'BLACKWELL',
# Intel GPUs
'PVC', 'INTELGPUMAX', 'MAX1100', 'MAX1550']

Expand Down Expand Up @@ -867,6 +869,12 @@ def limits(self, compiler=None, language=None):
'max-block-dims': 3,
}

def supports(self, query, language=None):
    """
    Check if the device supports a given feature.

    Parameters
    ----------
    query : str
        The feature being queried, e.g. 'async-loads' or 'tma'.
    language : str, optional
        The target language the feature would be used from.

    Notes
    -----
    Base implementation: no features are supported. Subclasses override
    this to advertise device-specific capabilities.
    """
    return False


class IntelDevice(Device):

Expand Down Expand Up @@ -895,6 +903,52 @@ def march(self):
return 'tesla'
return None

def supports(self, query, language=None):
    """
    Check if the device supports a given feature.

    Parameters
    ----------
    query : str
        The feature being queried. Recognized values: 'async-loads'
        (asynchronous pipeline loads, Ampere+) and 'tma' (Tensor Memory
        Accelerator, Hopper+).
    language : str, optional
        The target language. The features above are only available
        through 'cuda'.
    """
    if language != 'cuda':
        return False

    cc = get_nvidia_cc()
    if cc is None:
        # Defensive: compute-capability detection may fail (e.g. no CUDA
        # driver or device available); without it, assume no capability
        # rather than crashing on `None >= 80`
        return False

    if query == 'async-loads' and cc >= 80:
        # Asynchronous pipeline loads -- introduced in Ampere
        return True
    elif query == 'tma' and cc >= 90:
        # Tensor Memory Accelerator -- introduced in Hopper
        return True
    else:
        return False


class Volta(NvidiaDevice):
    # Marker subclass for Volta-generation Nvidia GPUs; capabilities are
    # inherited from NvidiaDevice (i.e. resolved via compute capability)
    pass


class Ampere(Volta):

    """An Ampere-generation Nvidia GPU."""

    def supports(self, query, language=None):
        """
        Ampere statically supports asynchronous pipeline loads under CUDA;
        everything else is resolved by the parent classes.
        """
        if language != 'cuda':
            return False
        # Async pipeline loads were introduced with this architecture
        return query == 'async-loads' or super().supports(query, language)


class Hopper(Ampere):

    """A Hopper-generation Nvidia GPU."""

    def supports(self, query, language=None):
        """
        Hopper statically supports the Tensor Memory Accelerator under CUDA;
        everything else is resolved by the parent classes.
        """
        if language != 'cuda':
            return False
        # TMA was introduced with this architecture
        return query == 'tma' or super().supports(query, language)


class Blackwell(Hopper):
    # Marker subclass for Blackwell-generation Nvidia GPUs; inherits all
    # Hopper capabilities (async-loads, tma)
    pass


class AmdDevice(Device):

Expand Down Expand Up @@ -963,6 +1017,10 @@ def march(cls):
# NOTE(review): built as a Cpu64 despite the 'gpu' name -- presumably a
# generic placeholder platform; confirm against Platform registry usage
ANYGPU = Cpu64('gpu')

# Nvidia platforms: a generic device plus one instance per architecture
NVIDIAX = NvidiaDevice('nvidiaX')
VOLTA = Volta('volta')
AMPERE = Ampere('ampere')
HOPPER = Hopper('hopper')
BLACKWELL = Blackwell('blackwell')

# AMD generic device
AMDGPUX = AmdDevice('amdgpuX')

Expand Down
16 changes: 8 additions & 8 deletions devito/arch/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@
from codepy.toolchain import (GCCToolchain,
call_capture_output as _call_capture_output)

from devito.arch import (AMDGPUX, Cpu64, AppleArm, NVIDIAX, POWER8, POWER9, Graviton,
IntelDevice, get_nvidia_cc, check_cuda_runtime,
from devito.arch import (AMDGPUX, Cpu64, AppleArm, NvidiaDevice, POWER8, POWER9,
Graviton, IntelDevice, get_nvidia_cc, check_cuda_runtime,
get_m1_llvm_path)
from devito.exceptions import CompilationError
from devito.logger import debug, warning
Expand Down Expand Up @@ -487,7 +487,7 @@ def __init_finalize__(self, **kwargs):
language = kwargs.pop('language', configuration['language'])
platform = kwargs.pop('platform', configuration['platform'])

if platform is NVIDIAX:
if isinstance(platform, NvidiaDevice):
self.cflags.remove('-std=c99')
# Add flags for OpenMP offloading
if language in ['C', 'openmp']:
Expand Down Expand Up @@ -555,7 +555,7 @@ def __init_finalize__(self, **kwargs):
if not configuration['safe-math']:
self.cflags.append('-ffast-math')

if platform is NVIDIAX:
if isinstance(platform, NvidiaDevice):
self.cflags.remove('-std=c99')
elif platform is AMDGPUX:
self.cflags.remove('-std=c99')
Expand Down Expand Up @@ -607,7 +607,7 @@ def __init_finalize__(self, **kwargs):
language = kwargs.pop('language', configuration['language'])
platform = kwargs.pop('platform', configuration['platform'])

if platform is NVIDIAX:
if isinstance(platform, NvidiaDevice):
if self.version >= Version("24.9"):
self.cflags.append('-gpu=mem:separate:pinnedalloc')
else:
Expand Down Expand Up @@ -843,7 +843,7 @@ def __init_finalize__(self, **kwargs):
self.ldflags.remove('-qopenmp')
self.ldflags.append('-fopenmp')

if platform is NVIDIAX:
if isinstance(platform, NvidiaDevice):
self.cflags.append('-fopenmp-targets=nvptx64-cuda')
elif isinstance(platform, IntelDevice):
self.cflags.append('-fiopenmp')
Expand Down Expand Up @@ -900,7 +900,7 @@ def __init_finalize__(self, **kwargs):

if isinstance(platform, Cpu64):
pass
elif platform is NVIDIAX:
elif isinstance(platform, NvidiaDevice):
self.cflags.append('-fsycl-targets=nvptx64-cuda')
elif isinstance(platform, IntelDevice):
self.cflags.append('-fsycl-targets=spir64')
Expand Down Expand Up @@ -931,7 +931,7 @@ def __new__(cls, *args, **kwargs):
_base = ClangCompiler
elif isinstance(platform, IntelDevice):
_base = OneapiCompiler
elif platform is NVIDIAX:
elif isinstance(platform, NvidiaDevice):
if language == 'cuda':
_base = CudaCompiler
else:
Expand Down
9 changes: 8 additions & 1 deletion devito/finite_differences/differentiable.py
Original file line number Diff line number Diff line change
Expand Up @@ -749,6 +749,12 @@ def __init_finalize__(self, *args, **kwargs):

super().__init_finalize__(*args, **kwargs)

@classmethod
def class_key(cls):
    """
    Ordering key ensuring Weights appear before any other AbstractFunction.
    """
    precedence, version, _ = Array.class_key()
    # Decrementing the middle slot makes Weights sort ahead of Array
    return precedence, version - 1, cls.__name__

def __eq__(self, other):
return (isinstance(other, Weights) and
self.name == other.name and
Expand Down Expand Up @@ -838,7 +844,8 @@ def compare(self, other):
n1 = self.__class__
n2 = other.__class__
if n1.__name__ == n2.__name__:
return self.base.compare(other.base)
return (self.weights.compare(other.weights) or
self.base.compare(other.base))
else:
return super().compare(other)

Expand Down
42 changes: 15 additions & 27 deletions devito/ir/clusters/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,17 @@
import numpy as np

from devito.ir.equations import ClusterizedEq
from devito.ir.support import (PARALLEL, PARALLEL_IF_PVT, BaseGuardBoundNext,
Forward, Interval, IntervalGroup, IterationSpace,
DataSpace, Guards, Properties, Scope, WaitLock,
WithLock, PrefetchUpdate, detect_accesses, detect_io,
normalize_properties, normalize_syncs, minimum,
maximum, null_ispace)
from devito.ir.support import (
PARALLEL, PARALLEL_IF_PVT, BaseGuardBoundNext, Forward, Interval, IntervalGroup,
IterationSpace, DataSpace, Guards, Properties, Scope, WaitLock, WithLock,
PrefetchUpdate, detect_accesses, detect_io, normalize_properties,
tailor_properties, update_properties, normalize_syncs, minimum, maximum,
null_ispace
)
from devito.mpi.halo_scheme import HaloScheme, HaloTouch
from devito.mpi.reduction_scheme import DistReduce
from devito.symbolics import estimate_cost
from devito.tools import as_tuple, flatten, infer_dtype
from devito.tools import as_tuple, filter_ordered, flatten, infer_dtype
from devito.types import Fence, WeakFence, CriticalRegion

__all__ = ["Cluster", "ClusterGroup"]
Expand Down Expand Up @@ -52,7 +53,8 @@ def __init__(self, exprs, ispace=null_ispace, guards=None, properties=None,
self._syncs = normalize_syncs(syncs or {})

properties = Properties(properties or {})
self._properties = tailor_properties(properties, ispace)
properties = tailor_properties(properties, ispace)
self._properties = update_properties(properties, self.exprs)

self._halo_scheme = halo_scheme

Expand Down Expand Up @@ -482,15 +484,17 @@ def properties(self):

@cached_property
def guards(self):
    """
    A view of the ClusterGroup's guards.
    """
    # Duplicate guards are dropped while preserving first-seen order
    unique = filter_ordered(c.guards for c in self)
    return tuple(unique)

@cached_property
def syncs(self):
    """
    A view of the ClusterGroup's synchronization operations.
    """
    # Non-strict: mixing WaitLock and WithLock ops across Clusters is
    # tolerated at the group level
    collected = [c.syncs for c in self]
    return normalize_syncs(*collected, strict=False)

@cached_property
def dspace(self):
Expand Down Expand Up @@ -540,19 +544,3 @@ def reduce_properties(clusters):
properties[d] = normalize_properties(properties.get(d, v), v)

return Properties(properties)


def tailor_properties(properties, ispace):
"""
Create a new Properties object off `properties` that retains all and only
the iteration dimensions in `ispace`.
"""
for i in properties:
for d in as_tuple(i):
if d not in ispace.itdims:
properties = properties.drop(d)

for d in ispace.itdims:
properties = properties.add(d)

return properties
80 changes: 76 additions & 4 deletions devito/ir/support/properties.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,14 @@ def __init__(self, name, val=None):
"""

PREFETCHABLE = Property('prefetchable')
"""
A Dimension along which prefetching is feasible and beneficial.
"""

PREFETCHABLE_SHM = Property('prefetchable-shm')
"""
A Dimension along which shared-memory prefetching is feasible and beneficial.
"""


# Bundles
Expand Down Expand Up @@ -129,6 +137,62 @@ def relax_properties(properties):
return frozenset(properties - {PARALLEL_INDEP})


def tailor_properties(properties, ispace):
    """
    Create a new Properties object off `properties` that retains all and only
    the iteration dimensions in `ispace`.
    """
    itdims = ispace.itdims

    # Discard properties keyed on dimensions foreign to the iteration space.
    # NB: we iterate the *original* mapping while rebinding `properties`,
    # which is safe because Properties is immutable
    for key in properties:
        for d in as_tuple(key):
            if d not in itdims:
                properties = properties.drop(d)

    # Guarantee an entry (possibly empty) for every iteration dimension
    for d in itdims:
        properties = properties.add(d)

    return properties


def update_properties(properties, exprs):
    """
    Create a new Properties object off `properties` augmented with properties
    discovered from `exprs` or with properties removed if they are incompatible
    with `exprs`.
    """
    exprs = as_tuple(exprs)

    if not exprs:
        return properties

    # Auto-detect prefetchable Dimensions
    dims = set()
    flag = False
    for e in as_tuple(exprs):
        w, r = e.args

        # Ensure it's in the form `Indexed = Indexed`
        try:
            wf, rf = w.function, r.function
        except AttributeError:
            break

        # NOTE(review): `_mem_shared`/`_mem_heap` are AbstractFunction flags;
        # the write side must live in shared memory for shm-prefetching to
        # apply -- confirm semantics against their definitions
        if not wf._mem_shared:
            break
        # Only parent (i.e. root iteration) Dimensions already tracked in
        # `properties` are candidates
        dims.update({d.parent for d in wf.dimensions if d.parent in properties})

        # ... while the read side must come from heap memory
        if not rf._mem_heap:
            break
    else:
        # for/else: reached only if no `break` fired, i.e. *every* expr
        # matched the shared-memory prefetch pattern
        flag = True

    if flag:
        properties = properties.prefetchable_shm(dims)
    else:
        # At least one incompatible expr -> drop the property if present
        properties = properties.drop(properties=PREFETCHABLE_SHM)

    return properties


class Properties(frozendict):

"""
Expand Down Expand Up @@ -183,12 +247,15 @@ def sequentialize(self, dims=None):
m[d] = normalize_properties(set(self.get(d, [])), {SEQUENTIAL})
return Properties(m)

def prefetchable(self, dims, v=PREFETCHABLE):
    """
    Create a new Properties with property `v` attached to each of `dims`.
    """
    updated = dict(self)
    for d in as_tuple(dims):
        # Union with any pre-existing properties for `d`
        updated[d] = self.get(d, set()) | {v}
    return Properties(updated)

def prefetchable_shm(self, dims):
    """
    Shortcut for `prefetchable` with the shared-memory property flavor.
    """
    return self.prefetchable(dims, v=PREFETCHABLE_SHM)

def block(self, dims, kind='default'):
if kind == 'default':
p = TILABLE
Expand Down Expand Up @@ -232,8 +299,13 @@ def is_blockable(self, d):
def is_blockable_small(self, d):
return TILABLE_SMALL in self.get(d, set())

def is_prefetchable(self, dims=None, v=PREFETCHABLE):
    """
    True if property `v` is attached to any of `dims` (all known Dimensions
    if `dims` is None), False otherwise.
    """
    if dims is None:
        candidates = list(self)
    else:
        candidates = as_tuple(dims)
    return any(v in self.get(d, set()) for d in candidates)

def is_prefetchable_shm(self, dims=None):
    """
    Shortcut for `is_prefetchable` with the shared-memory property flavor.
    """
    return self.is_prefetchable(dims, v=PREFETCHABLE_SHM)

@property
def nblockable(self):
Expand Down
15 changes: 8 additions & 7 deletions devito/ir/support/syncs.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ def update(self, ops):
return Ops(m)


def normalize_syncs(*args):
def normalize_syncs(*args, strict=True):
if not args:
return {}

Expand All @@ -175,12 +175,13 @@ def normalize_syncs(*args):

syncs = {k: tuple(filter_ordered(v)) for k, v in syncs.items()}

for v in syncs.values():
waitlocks = [s for s in v if isinstance(s, WaitLock)]
withlocks = [s for s in v if isinstance(s, WithLock)]
if strict:
for v in syncs.values():
waitlocks = [s for s in v if isinstance(s, WaitLock)]
withlocks = [s for s in v if isinstance(s, WithLock)]

if waitlocks and withlocks:
# We do not allow mixing up WaitLock and WithLock ops
raise ValueError("Incompatible SyncOps")
if waitlocks and withlocks:
# We do not allow mixing up WaitLock and WithLock ops
raise ValueError("Incompatible SyncOps")

return Ops(syncs)
Loading

0 comments on commit 82cdb29

Please sign in to comment.