Skip to content

Commit

Permalink
Merge pull request #2516 from devitocodes/async-loads-final-2
Browse files Browse the repository at this point in the history
compiler: Misc improvements to code generation
  • Loading branch information
FabioLuporini authored Jan 28, 2025
2 parents f71764a + b8de9ec commit 82cdb29
Show file tree
Hide file tree
Showing 14 changed files with 333 additions and 131 deletions.
58 changes: 58 additions & 0 deletions devito/arch/archinfo.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@
'POWER8', 'POWER9',
# Generic GPUs
'AMDGPUX', 'NVIDIAX', 'INTELGPUX',
# Nvidia GPUs
'VOLTA', 'AMPERE', 'HOPPER', 'BLACKWELL',
# Intel GPUs
'PVC', 'INTELGPUMAX', 'MAX1100', 'MAX1550']

Expand Down Expand Up @@ -867,6 +869,12 @@ def limits(self, compiler=None, language=None):
'max-block-dims': 3,
}

def supports(self, query, language=None):
    """
    Check if the device supports a given feature.

    Parameters
    ----------
    query : str
        The feature being queried, e.g. 'async-loads' or 'tma'.
    language : str, optional
        The target language the feature would be used from.

    Notes
    -----
    Base implementation: no features are supported. Subclasses override
    this to advertise device-specific capabilities.
    """
    return False


class IntelDevice(Device):

Expand Down Expand Up @@ -895,6 +903,52 @@ def march(self):
return 'tesla'
return None

def supports(self, query, language=None):
    """
    Check if the device supports a given feature.

    Parameters
    ----------
    query : str
        The feature being queried. Recognized values: 'async-loads'
        (asynchronous pipeline loads, Ampere+) and 'tma' (Tensor Memory
        Accelerator, Hopper+).
    language : str, optional
        The target language. The features above are only available
        through 'cuda'.
    """
    if language != 'cuda':
        return False

    cc = get_nvidia_cc()
    if cc is None:
        # Defensive: compute-capability detection may fail (e.g. no CUDA
        # driver or device available); without it, assume no capability
        # rather than crashing on `None >= 80`
        return False

    if query == 'async-loads' and cc >= 80:
        # Asynchronous pipeline loads -- introduced in Ampere
        return True
    elif query == 'tma' and cc >= 90:
        # Tensor Memory Accelerator -- introduced in Hopper
        return True
    else:
        return False


class Volta(NvidiaDevice):
    # Marker subclass for Volta-generation Nvidia GPUs; capabilities are
    # inherited from NvidiaDevice (i.e. resolved via compute capability)
    pass


class Ampere(Volta):

    """An Ampere-generation Nvidia GPU."""

    def supports(self, query, language=None):
        """
        Ampere statically supports asynchronous pipeline loads under CUDA;
        everything else is resolved by the parent classes.
        """
        if language != 'cuda':
            return False
        # Async pipeline loads were introduced with this architecture
        return query == 'async-loads' or super().supports(query, language)


class Hopper(Ampere):

    """A Hopper-generation Nvidia GPU."""

    def supports(self, query, language=None):
        """
        Hopper statically supports the Tensor Memory Accelerator under CUDA;
        everything else is resolved by the parent classes.
        """
        if language != 'cuda':
            return False
        # TMA was introduced with this architecture
        return query == 'tma' or super().supports(query, language)


class Blackwell(Hopper):
    # Marker subclass for Blackwell-generation Nvidia GPUs; inherits all
    # Hopper capabilities (async-loads, tma)
    pass


class AmdDevice(Device):

Expand Down Expand Up @@ -963,6 +1017,10 @@ def march(cls):
# NOTE(review): built as a Cpu64 despite the 'gpu' name -- presumably a
# generic placeholder platform; confirm against Platform registry usage
ANYGPU = Cpu64('gpu')

# Nvidia platforms: a generic device plus one instance per architecture
NVIDIAX = NvidiaDevice('nvidiaX')
VOLTA = Volta('volta')
AMPERE = Ampere('ampere')
HOPPER = Hopper('hopper')
BLACKWELL = Blackwell('blackwell')

# AMD generic device
AMDGPUX = AmdDevice('amdgpuX')

Expand Down
16 changes: 8 additions & 8 deletions devito/arch/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@
from codepy.toolchain import (GCCToolchain,
call_capture_output as _call_capture_output)

from devito.arch import (AMDGPUX, Cpu64, AppleArm, NVIDIAX, POWER8, POWER9, Graviton,
IntelDevice, get_nvidia_cc, check_cuda_runtime,
from devito.arch import (AMDGPUX, Cpu64, AppleArm, NvidiaDevice, POWER8, POWER9,
Graviton, IntelDevice, get_nvidia_cc, check_cuda_runtime,
get_m1_llvm_path)
from devito.exceptions import CompilationError
from devito.logger import debug, warning
Expand Down Expand Up @@ -487,7 +487,7 @@ def __init_finalize__(self, **kwargs):
language = kwargs.pop('language', configuration['language'])
platform = kwargs.pop('platform', configuration['platform'])

if platform is NVIDIAX:
if isinstance(platform, NvidiaDevice):
self.cflags.remove('-std=c99')
# Add flags for OpenMP offloading
if language in ['C', 'openmp']:
Expand Down Expand Up @@ -555,7 +555,7 @@ def __init_finalize__(self, **kwargs):
if not configuration['safe-math']:
self.cflags.append('-ffast-math')

if platform is NVIDIAX:
if isinstance(platform, NvidiaDevice):
self.cflags.remove('-std=c99')
elif platform is AMDGPUX:
self.cflags.remove('-std=c99')
Expand Down Expand Up @@ -607,7 +607,7 @@ def __init_finalize__(self, **kwargs):
language = kwargs.pop('language', configuration['language'])
platform = kwargs.pop('platform', configuration['platform'])

if platform is NVIDIAX:
if isinstance(platform, NvidiaDevice):
if self.version >= Version("24.9"):
self.cflags.append('-gpu=mem:separate:pinnedalloc')
else:
Expand Down Expand Up @@ -843,7 +843,7 @@ def __init_finalize__(self, **kwargs):
self.ldflags.remove('-qopenmp')
self.ldflags.append('-fopenmp')

if platform is NVIDIAX:
if isinstance(platform, NvidiaDevice):
self.cflags.append('-fopenmp-targets=nvptx64-cuda')
elif isinstance(platform, IntelDevice):
self.cflags.append('-fiopenmp')
Expand Down Expand Up @@ -900,7 +900,7 @@ def __init_finalize__(self, **kwargs):

if isinstance(platform, Cpu64):
pass
elif platform is NVIDIAX:
elif isinstance(platform, NvidiaDevice):
self.cflags.append('-fsycl-targets=nvptx64-cuda')
elif isinstance(platform, IntelDevice):
self.cflags.append('-fsycl-targets=spir64')
Expand Down Expand Up @@ -931,7 +931,7 @@ def __new__(cls, *args, **kwargs):
_base = ClangCompiler
elif isinstance(platform, IntelDevice):
_base = OneapiCompiler
elif platform is NVIDIAX:
elif isinstance(platform, NvidiaDevice):
if language == 'cuda':
_base = CudaCompiler
else:
Expand Down
9 changes: 8 additions & 1 deletion devito/finite_differences/differentiable.py
Original file line number Diff line number Diff line change
Expand Up @@ -749,6 +749,12 @@ def __init_finalize__(self, *args, **kwargs):

super().__init_finalize__(*args, **kwargs)

@classmethod
def class_key(cls):
    """
    Ordering key ensuring Weights appear before any other AbstractFunction.
    """
    precedence, version, _ = Array.class_key()
    # Decrementing the middle slot makes Weights sort ahead of Array
    return precedence, version - 1, cls.__name__

def __eq__(self, other):
return (isinstance(other, Weights) and
self.name == other.name and
Expand Down Expand Up @@ -838,7 +844,8 @@ def compare(self, other):
n1 = self.__class__
n2 = other.__class__
if n1.__name__ == n2.__name__:
return self.base.compare(other.base)
return (self.weights.compare(other.weights) or
self.base.compare(other.base))
else:
return super().compare(other)

Expand Down
42 changes: 15 additions & 27 deletions devito/ir/clusters/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,17 @@
import numpy as np

from devito.ir.equations import ClusterizedEq
from devito.ir.support import (PARALLEL, PARALLEL_IF_PVT, BaseGuardBoundNext,
Forward, Interval, IntervalGroup, IterationSpace,
DataSpace, Guards, Properties, Scope, WaitLock,
WithLock, PrefetchUpdate, detect_accesses, detect_io,
normalize_properties, normalize_syncs, minimum,
maximum, null_ispace)
from devito.ir.support import (
PARALLEL, PARALLEL_IF_PVT, BaseGuardBoundNext, Forward, Interval, IntervalGroup,
IterationSpace, DataSpace, Guards, Properties, Scope, WaitLock, WithLock,
PrefetchUpdate, detect_accesses, detect_io, normalize_properties,
tailor_properties, update_properties, normalize_syncs, minimum, maximum,
null_ispace
)
from devito.mpi.halo_scheme import HaloScheme, HaloTouch
from devito.mpi.reduction_scheme import DistReduce
from devito.symbolics import estimate_cost
from devito.tools import as_tuple, flatten, infer_dtype
from devito.tools import as_tuple, filter_ordered, flatten, infer_dtype
from devito.types import Fence, WeakFence, CriticalRegion

__all__ = ["Cluster", "ClusterGroup"]
Expand Down Expand Up @@ -52,7 +53,8 @@ def __init__(self, exprs, ispace=null_ispace, guards=None, properties=None,
self._syncs = normalize_syncs(syncs or {})

properties = Properties(properties or {})
self._properties = tailor_properties(properties, ispace)
properties = tailor_properties(properties, ispace)
self._properties = update_properties(properties, self.exprs)

self._halo_scheme = halo_scheme

Expand Down Expand Up @@ -482,15 +484,17 @@ def properties(self):

@cached_property
def guards(self):
    """
    A view of the ClusterGroup's guards.
    """
    # Duplicate guards are dropped while preserving first-seen order
    unique = filter_ordered(c.guards for c in self)
    return tuple(unique)

@cached_property
def syncs(self):
    """
    A view of the ClusterGroup's synchronization operations.
    """
    # Non-strict: mixing WaitLock and WithLock ops across Clusters is
    # tolerated at the group level
    collected = [c.syncs for c in self]
    return normalize_syncs(*collected, strict=False)

@cached_property
def dspace(self):
Expand Down Expand Up @@ -540,19 +544,3 @@ def reduce_properties(clusters):
properties[d] = normalize_properties(properties.get(d, v), v)

return Properties(properties)


def tailor_properties(properties, ispace):
"""
Create a new Properties object off `properties` that retains all and only
the iteration dimensions in `ispace`.
"""
for i in properties:
for d in as_tuple(i):
if d not in ispace.itdims:
properties = properties.drop(d)

for d in ispace.itdims:
properties = properties.add(d)

return properties
80 changes: 76 additions & 4 deletions devito/ir/support/properties.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,14 @@ def __init__(self, name, val=None):
"""

PREFETCHABLE = Property('prefetchable')
"""
A Dimension along which prefetching is feasible and beneficial.
"""

PREFETCHABLE_SHM = Property('prefetchable-shm')
"""
A Dimension along which shared-memory prefetching is feasible and beneficial.
"""


# Bundles
Expand Down Expand Up @@ -129,6 +137,62 @@ def relax_properties(properties):
return frozenset(properties - {PARALLEL_INDEP})


def tailor_properties(properties, ispace):
    """
    Create a new Properties object off `properties` that retains all and only
    the iteration dimensions in `ispace`.
    """
    itdims = ispace.itdims

    # Discard properties keyed on dimensions foreign to the iteration space.
    # NB: we iterate the *original* mapping while rebinding `properties`,
    # which is safe because Properties is immutable
    for key in properties:
        for d in as_tuple(key):
            if d not in itdims:
                properties = properties.drop(d)

    # Guarantee an entry (possibly empty) for every iteration dimension
    for d in itdims:
        properties = properties.add(d)

    return properties


def update_properties(properties, exprs):
    """
    Create a new Properties object off `properties` augmented with properties
    discovered from `exprs` or with properties removed if they are incompatible
    with `exprs`.
    """
    exprs = as_tuple(exprs)

    if not exprs:
        return properties

    # Auto-detect prefetchable Dimensions
    dims = set()
    flag = False
    for e in as_tuple(exprs):
        w, r = e.args

        # Ensure it's in the form `Indexed = Indexed`
        try:
            wf, rf = w.function, r.function
        except AttributeError:
            break

        # NOTE(review): `_mem_shared`/`_mem_heap` are AbstractFunction flags;
        # the write side must live in shared memory for shm-prefetching to
        # apply -- confirm semantics against their definitions
        if not wf._mem_shared:
            break
        # Only parent (i.e. root iteration) Dimensions already tracked in
        # `properties` are candidates
        dims.update({d.parent for d in wf.dimensions if d.parent in properties})

        # ... while the read side must come from heap memory
        if not rf._mem_heap:
            break
    else:
        # for/else: reached only if no `break` fired, i.e. *every* expr
        # matched the shared-memory prefetch pattern
        flag = True

    if flag:
        properties = properties.prefetchable_shm(dims)
    else:
        # At least one incompatible expr -> drop the property if present
        properties = properties.drop(properties=PREFETCHABLE_SHM)

    return properties


class Properties(frozendict):

"""
Expand Down Expand Up @@ -183,12 +247,15 @@ def sequentialize(self, dims=None):
m[d] = normalize_properties(set(self.get(d, [])), {SEQUENTIAL})
return Properties(m)

def prefetchable(self, dims, v=PREFETCHABLE):
    """
    Create a new Properties with property `v` attached to each of `dims`.
    """
    updated = dict(self)
    for d in as_tuple(dims):
        # Union with any pre-existing properties for `d`
        updated[d] = self.get(d, set()) | {v}
    return Properties(updated)

def prefetchable_shm(self, dims):
    """
    Shortcut for `prefetchable` with the shared-memory property flavor.
    """
    return self.prefetchable(dims, v=PREFETCHABLE_SHM)

def block(self, dims, kind='default'):
if kind == 'default':
p = TILABLE
Expand Down Expand Up @@ -232,8 +299,13 @@ def is_blockable(self, d):
def is_blockable_small(self, d):
return TILABLE_SMALL in self.get(d, set())

def is_prefetchable(self, dims=None, v=PREFETCHABLE):
    """
    True if property `v` is attached to any of `dims` (all known Dimensions
    if `dims` is None), False otherwise.
    """
    if dims is None:
        candidates = list(self)
    else:
        candidates = as_tuple(dims)
    return any(v in self.get(d, set()) for d in candidates)

def is_prefetchable_shm(self, dims=None):
    """
    Shortcut for `is_prefetchable` with the shared-memory property flavor.
    """
    return self.is_prefetchable(dims, v=PREFETCHABLE_SHM)

@property
def nblockable(self):
Expand Down
15 changes: 8 additions & 7 deletions devito/ir/support/syncs.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ def update(self, ops):
return Ops(m)


def normalize_syncs(*args):
def normalize_syncs(*args, strict=True):
if not args:
return {}

Expand All @@ -175,12 +175,13 @@ def normalize_syncs(*args):

syncs = {k: tuple(filter_ordered(v)) for k, v in syncs.items()}

for v in syncs.values():
waitlocks = [s for s in v if isinstance(s, WaitLock)]
withlocks = [s for s in v if isinstance(s, WithLock)]
if strict:
for v in syncs.values():
waitlocks = [s for s in v if isinstance(s, WaitLock)]
withlocks = [s for s in v if isinstance(s, WithLock)]

if waitlocks and withlocks:
# We do not allow mixing up WaitLock and WithLock ops
raise ValueError("Incompatible SyncOps")
if waitlocks and withlocks:
# We do not allow mixing up WaitLock and WithLock ops
raise ValueError("Incompatible SyncOps")

return Ops(syncs)
Loading

0 comments on commit 82cdb29

Please sign in to comment.