From d8ef62c018f9605ddbd3abc6b07979109a630182 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Mon, 28 Dec 2020 13:41:24 -0600 Subject: [PATCH 001/107] Start editing benchmark --- tests/benchmark/pingpong.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/tests/benchmark/pingpong.py b/tests/benchmark/pingpong.py index cb6060c2..cb367f81 100644 --- a/tests/benchmark/pingpong.py +++ b/tests/benchmark/pingpong.py @@ -1,6 +1,7 @@ from charm4py import charm, Chare, Array, coro, Future from time import time -#import numpy as np +import numpy as np +from numba import cuda PAYLOAD = 100 # number of bytes NITER = 10000 @@ -15,11 +16,15 @@ def __init__(self): else: self.neighbor = self.thisProxy[0] - def start(self, done_future, threaded=False): + def start(self, done_future, threaded=False, gpu=False): + assert threaded ^ gpu self.done_future = done_future self.iter = 0 - #data = np.zeros(PAYLOAD, dtype='int8') - data = 3 + if not gpu: + data = np.zeros(PAYLOAD, dtype='int8') + else: + pass + # data = 3 self.startTime = time() if threaded: self.neighbor.recv_th(data) @@ -50,11 +55,13 @@ def main(args): threaded = False if len(args) > 1 and args[1] == '-t': threaded = True + elif len(args) >1 and args[1] == '--gpu': + gpu = True pings = Array(Ping, 2) charm.awaitCreation(pings) for _ in range(2): done_future = Future() - pings[0].start(done_future, threaded) + pings[0].start(done_future, threaded, gpu) totalTime = done_future.get() print("ping pong time per iter (us)=", totalTime / NITER * 1000000) exit() From 6804a96137325ac03133338e073b716525721f68 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 30 Dec 2020 10:21:49 -0500 Subject: [PATCH 002/107] Creation of standalone GPU pingpong file --- tests/benchmark/pingpong_gpu.py | 82 +++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 tests/benchmark/pingpong_gpu.py diff --git a/tests/benchmark/pingpong_gpu.py b/tests/benchmark/pingpong_gpu.py new file mode 100644 index 00000000..2c304456 --- /dev/null +++ b/tests/benchmark/pingpong_gpu.py @@ -0,0 +1,82 @@ +from charm4py import charm, Chare, Array, coro, Future +from time import time +import numpy as np +from numba import cuda + +PAYLOAD = 100 # number of bytes +NITER = 10000 + + +class Ping(Chare): + + def __init__(self, gpu, num_iters): + self.gpu = gpu + self.myIndex = self.thisIndex[0] + if self.myIndex == 0: + self.neighbor = self.thisProxy[1] + else: + self.neighbor = self.thisProxy[0] + + def start(self, done_future, payload_size): + self.done_future = done_future + self.iter = 0 + data = np.zeros(payload_size, dtype='int8') + if self.gpu: + data = cuda.to_device(data) + self.startTime = time() + + else: + self.neighbor.recv(data) + + def recv(self, data): + data = cuda.to_device(data) + if self.myIndex == 0: + self.iter += 1 + if self.iter == NITER: + totalTime = time() - self.startTime + self.done_future.send(totalTime) + return + data = data.copy_to_host() + self.neighbor.recv(data) + + @coro + def recv_th(self, data): + if self.myIndex == 0: + self.iter += 1 + if self.iter == NITER: + totalTime = time() - self.startTime + self.done_future.send(totalTime) + return + self.neighbor.recv_th(data) + + +def main(args): + threaded = False + gpu = False + min_msg_size, max_mig_size, low_iter, high_iter, printFormat, gpu = 0 + if len(args) < 7: + print("Doesn't have the required input params. 
Usage:" + " \n" + ) + charm.exit(-1) + + min_msg_size = int(args[1]) + max_msg_size = int(args[2]) + low_iter = int(args[3]) + high_iter = int(args[4]) + print_format = int(args[5]) + gpu = int(args[6]) + + pings = Array(Ping, 2, gpu) + charm.awaitCreation(pings) + for _ in range(2): + done_future = Future() + pings[0].start(done_future, threaded, gpu) + totalTime = done_future.get() + print("ping pong time per iter (us)=", totalTime / NITER * 1000000) + exit() + + +charm.start(main) From 28203e145b3b44982eab572718f61cc4c7ad17f2 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 30 Dec 2020 12:07:24 -0500 Subject: [PATCH 003/107] Added CPU-only pingpong --- tests/benchmark/pingpong_gpu.py | 123 ++++++++++++++++++-------------- 1 file changed, 71 insertions(+), 52 deletions(-) diff --git a/tests/benchmark/pingpong_gpu.py b/tests/benchmark/pingpong_gpu.py index 2c304456..b2c067ea 100644 --- a/tests/benchmark/pingpong_gpu.py +++ b/tests/benchmark/pingpong_gpu.py @@ -1,62 +1,74 @@ -from charm4py import charm, Chare, Array, coro, Future -from time import time +from charm4py import charm, Chare, Array, coro, Future, Channel, Group +import time import numpy as np from numba import cuda -PAYLOAD = 100 # number of bytes -NITER = 10000 +class Ping(Chare): + #TODO: How do we determine how many chares? + def __init__(self, use_gpudirect, print_format): + self.gpu_direct = use_gpudirect + self.num_chares = charm.numPes() + self.print_format = print_format + # self.am_low_chare = self.thisIndex < self.num_chares // 2 + self.am_low_chare = self.thisIndex == 0 -class Ping(Chare): + if self.am_low_chare: + print("Msg Size, Iterations, One-way Time (us), Bandwidth (bytes/us)") - def __init__(self, gpu, num_iters): - self.gpu = gpu - self.myIndex = self.thisIndex[0] - if self.myIndex == 0: - self.neighbor = self.thisProxy[1] - else: - self.neighbor = self.thisProxy[0] + @coro + def do_iteration(self, message_size, num_iters, done_future): + # TODO: How do we allocate device data again? + # dev_array = cuda.zeros(on_device) + # host_data = cuda.zeros(on_host) + data = np.zeros(message_size, dtype='int8') + partner_idx = int(not self.thisIndex) + # partner = self.thisProxy[self.thisIndex + self.num_chares // 2] + partner = self.thisProxy[partner_idx] + partner_channel = Channel(self, partner) - def start(self, done_future, payload_size): - self.done_future = done_future - self.iter = 0 - data = np.zeros(payload_size, dtype='int8') - if self.gpu: - data = cuda.to_device(data) - self.startTime = time() + tstart = time.time() - else: - self.neighbor.recv(data) - - def recv(self, data): - data = cuda.to_device(data) - if self.myIndex == 0: - self.iter += 1 - if self.iter == NITER: - totalTime = time() - self.startTime - self.done_future.send(totalTime) - return - data = data.copy_to_host() - self.neighbor.recv(data) - @coro - def recv_th(self, data): - if self.myIndex == 0: - self.iter += 1 - if self.iter == NITER: - totalTime = time() - self.startTime - self.done_future.send(totalTime) - return - self.neighbor.recv_th(data) + for _ in range(num_iters): + if self.am_low_chare: + if not self.gpu_direct: + dev_array = 0 # copy the device array to memory on device, TODO: use pinned memory? + # partner_channel.send(dev_array) + partner_channel.send(data) + partner_channel.recv() + + # if not self.gpu_direct: + else: + partner_channel.recv() + partner_channel.send(data) + # copy the data back to the device + + # TODO: should we have barrier (reduction) here? 
+ tend = time.time() + elapsed_time = tend - tstart + + if self.am_low_chare: + # display data here + self.display_iteration_data(elapsed_time, num_iters, message_size) + + self.reduce(done_future) + + def display_iteration_data(self, elapsed_time, num_iters, message_size): + elapsed_time /= 2 # 1-way performance, not RTT + elapsed_time /= num_iters # Time for each message + bandwidth = message_size / elapsed_time + if self.print_format == 0: + print(f'{message_size},{num_iters},{elapsed_time * 1e6},{bandwidth / 1e6}') + else: + print('Not implemented!') def main(args): - threaded = False - gpu = False - min_msg_size, max_mig_size, low_iter, high_iter, printFormat, gpu = 0 if len(args) < 7: print("Doesn't have the required input params. Usage:" - " " + " \n" ) @@ -67,16 +79,23 @@ def main(args): low_iter = int(args[3]) high_iter = int(args[4]) print_format = int(args[5]) - gpu = int(args[6]) + use_gpudirect = int(args[6]) - pings = Array(Ping, 2, gpu) + pings = Group(Ping, args=[use_gpudirect, print_format]) charm.awaitCreation(pings) - for _ in range(2): + msg_size = min_msg_size + + while msg_size <= max_msg_size: + if msg_size <= 1048576: + iter = low_iter + else: + iter = high_iter done_future = Future() - pings[0].start(done_future, threaded, gpu) - totalTime = done_future.get() - print("ping pong time per iter (us)=", totalTime / NITER * 1000000) - exit() + pings.do_iteration(msg_size, iter, done_future) + done_future.get() + msg_size *= 2 + + charm.exit() charm.start(main) From 681129cd918f8eed6f71d87e3ba0af8d9024738c Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 30 Dec 2020 13:02:54 -0500 Subject: [PATCH 004/107] Added correct print format, pinned memory now used for staging host memory --- tests/benchmark/pingpong_gpu.py | 60 ++++++++++++++++++++------------- 1 file changed, 36 insertions(+), 24 deletions(-) diff --git a/tests/benchmark/pingpong_gpu.py b/tests/benchmark/pingpong_gpu.py index b2c067ea..9ade4ecb 100644 --- a/tests/benchmark/pingpong_gpu.py +++ b/tests/benchmark/pingpong_gpu.py @@ -3,66 +3,78 @@ import numpy as np from numba import cuda -class Ping(Chare): +USE_PINNED = True - #TODO: How do we determine how many chares? +class Ping(Chare): def __init__(self, use_gpudirect, print_format): self.gpu_direct = use_gpudirect self.num_chares = charm.numPes() self.print_format = print_format - # self.am_low_chare = self.thisIndex < self.num_chares // 2 self.am_low_chare = self.thisIndex == 0 if self.am_low_chare: - print("Msg Size, Iterations, One-way Time (us), Bandwidth (bytes/us)") + if print_format == 0: + print("Msg Size, Iterations, One-way Time (us), " + "Bandwidth (bytes/us)" + ) + else: + print(f'{"Msg Size": <30} {"Iterations": <25} ' + f'{"One-way Time (us)": <20} {"Bandwidth (bytes/us)": <20}' + ) @coro def do_iteration(self, message_size, num_iters, done_future): - # TODO: How do we allocate device data again? 
- # dev_array = cuda.zeros(on_device) - # host_data = cuda.zeros(on_host) - data = np.zeros(message_size, dtype='int8') + if USE_PINNED: + h_data = cuda.pinned_array(message_size, dtype='int8') + else: + h_data = np.zeros(message_size, dtype='int8') + d_data = cuda.device_array(message_size, dtype='int8') + d_data.copy_to_device(h_data) partner_idx = int(not self.thisIndex) - # partner = self.thisProxy[self.thisIndex + self.num_chares // 2] partner = self.thisProxy[partner_idx] partner_channel = Channel(self, partner) tstart = time.time() - for _ in range(num_iters): if self.am_low_chare: if not self.gpu_direct: - dev_array = 0 # copy the device array to memory on device, TODO: use pinned memory? - # partner_channel.send(dev_array) - partner_channel.send(data) - partner_channel.recv() + d_data.copy_to_host(h_data) + # partner_channel.send(dev_array) + partner_channel.send(h_data) + d_data.copy_to_device(partner_channel.recv()) + else: + raise NotImplementedError("TODO: GPU Direct") - # if not self.gpu_direct: else: - partner_channel.recv() - partner_channel.send(data) - # copy the data back to the device + if not self.gpu_direct: + d_data.copy_to_device(partner_channel.recv()) + d_data.copy_to_host(h_data) + partner_channel.send(h_data) + else: + raise NotImplementedError("TODO: GPU Direct") - # TODO: should we have barrier (reduction) here? tend = time.time() elapsed_time = tend - tstart if self.am_low_chare: - # display data here self.display_iteration_data(elapsed_time, num_iters, message_size) self.reduce(done_future) def display_iteration_data(self, elapsed_time, num_iters, message_size): - elapsed_time /= 2 # 1-way performance, not RTT - elapsed_time /= num_iters # Time for each message + elapsed_time /= 2 # 1-way performance, not RTT + elapsed_time /= num_iters # Time for each message bandwidth = message_size / elapsed_time if self.print_format == 0: - print(f'{message_size},{num_iters},{elapsed_time * 1e6},{bandwidth / 1e6}') + print(f'{message_size},{num_iters},{elapsed_time * 1e6},' + f'{bandwidth / 1e6}' + ) else: - print('Not implemented!') + print(f'{message_size: <30} {num_iters: <25} ' + f'{elapsed_time * 1e6: <20} {bandwidth / 1e6: <20}' + ) def main(args): if len(args) < 7: From 5fdfee3956708570b006bffab8c0942079f37f1c Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 30 Dec 2020 13:04:37 -0500 Subject: [PATCH 005/107] Stage script --- tests/benchmark/pingpong.py | 44 +++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/tests/benchmark/pingpong.py b/tests/benchmark/pingpong.py index cb367f81..2c304456 100644 --- a/tests/benchmark/pingpong.py +++ b/tests/benchmark/pingpong.py @@ -9,35 +9,34 @@ class Ping(Chare): - def __init__(self): + def __init__(self, gpu, num_iters): + self.gpu = gpu self.myIndex = self.thisIndex[0] if self.myIndex == 0: self.neighbor = self.thisProxy[1] else: self.neighbor = self.thisProxy[0] - def start(self, done_future, threaded=False, gpu=False): - assert threaded ^ gpu + def start(self, done_future, payload_size): self.done_future = done_future self.iter = 0 - if not gpu: - data = np.zeros(PAYLOAD, dtype='int8') - else: - pass - # data = 3 - self.startTime = time() - if threaded: - self.neighbor.recv_th(data) + data = np.zeros(payload_size, dtype='int8') + if self.gpu: + data = cuda.to_device(data) + self.startTime = time() + else: self.neighbor.recv(data) def recv(self, data): + data = cuda.to_device(data) if self.myIndex == 0: self.iter += 1 if self.iter == NITER: totalTime = time() - 
self.startTime self.done_future.send(totalTime) return + data = data.copy_to_host() self.neighbor.recv(data) @coro @@ -53,11 +52,24 @@ def recv_th(self, data): def main(args): threaded = False - if len(args) > 1 and args[1] == '-t': - threaded = True - elif len(args) >1 and args[1] == '--gpu': - gpu = True - pings = Array(Ping, 2) + gpu = False + min_msg_size, max_mig_size, low_iter, high_iter, printFormat, gpu = 0 + if len(args) < 7: + print("Doesn't have the required input params. Usage:" + " \n" + ) + charm.exit(-1) + + min_msg_size = int(args[1]) + max_msg_size = int(args[2]) + low_iter = int(args[3]) + high_iter = int(args[4]) + print_format = int(args[5]) + gpu = int(args[6]) + + pings = Array(Ping, 2, gpu) charm.awaitCreation(pings) for _ in range(2): done_future = Future() From 54126345db3077e5e6dbe0c3bc113a7b55b320c7 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 30 Dec 2020 13:09:41 -0500 Subject: [PATCH 006/107] Reset changes to pingpong --- tests/benchmark/pingpong.py | 43 +++++++++++-------------------------- 1 file changed, 12 insertions(+), 31 deletions(-) diff --git a/tests/benchmark/pingpong.py b/tests/benchmark/pingpong.py index 2c304456..cb6060c2 100644 --- a/tests/benchmark/pingpong.py +++ b/tests/benchmark/pingpong.py @@ -1,7 +1,6 @@ from charm4py import charm, Chare, Array, coro, Future from time import time -import numpy as np -from numba import cuda +#import numpy as np PAYLOAD = 100 # number of bytes NITER = 10000 @@ -9,34 +8,31 @@ class Ping(Chare): - def __init__(self, gpu, num_iters): - self.gpu = gpu + def __init__(self): self.myIndex = self.thisIndex[0] if self.myIndex == 0: self.neighbor = self.thisProxy[1] else: self.neighbor = self.thisProxy[0] - def start(self, done_future, payload_size): + def start(self, done_future, threaded=False): self.done_future = done_future self.iter = 0 - data = np.zeros(payload_size, dtype='int8') - if self.gpu: - data = cuda.to_device(data) - self.startTime = time() - + #data = np.zeros(PAYLOAD, dtype='int8') + data = 3 + self.startTime = time() + if threaded: + self.neighbor.recv_th(data) else: self.neighbor.recv(data) def recv(self, data): - data = cuda.to_device(data) if self.myIndex == 0: self.iter += 1 if self.iter == NITER: totalTime = time() - self.startTime self.done_future.send(totalTime) return - data = data.copy_to_host() self.neighbor.recv(data) @coro @@ -52,28 +48,13 @@ def recv_th(self, data): def main(args): threaded = False - gpu = False - min_msg_size, max_mig_size, low_iter, high_iter, printFormat, gpu = 0 - if len(args) < 7: - print("Doesn't have the required input params. 
Usage:" - " \n" - ) - charm.exit(-1) - - min_msg_size = int(args[1]) - max_msg_size = int(args[2]) - low_iter = int(args[3]) - high_iter = int(args[4]) - print_format = int(args[5]) - gpu = int(args[6]) - - pings = Array(Ping, 2, gpu) + if len(args) > 1 and args[1] == '-t': + threaded = True + pings = Array(Ping, 2) charm.awaitCreation(pings) for _ in range(2): done_future = Future() - pings[0].start(done_future, threaded, gpu) + pings[0].start(done_future, threaded) totalTime = done_future.get() print("ping pong time per iter (us)=", totalTime / NITER * 1000000) exit() From 2d4b2c446b1806379f96e537efa5af5dd1df04f3 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 30 Dec 2020 13:18:22 -0500 Subject: [PATCH 007/107] Unify API between the CPU/GPU benchmarks --- tests/benchmark/pingpong.py | 137 +++++++++++++++++++++++------------- 1 file changed, 88 insertions(+), 49 deletions(-) diff --git a/tests/benchmark/pingpong.py b/tests/benchmark/pingpong.py index cb6060c2..6c027fb9 100644 --- a/tests/benchmark/pingpong.py +++ b/tests/benchmark/pingpong.py @@ -1,63 +1,102 @@ -from charm4py import charm, Chare, Array, coro, Future -from time import time -#import numpy as np +from charm4py import charm, Chare, Array, coro, Future, Channel, Group +import time +import numpy as np -PAYLOAD = 100 # number of bytes -NITER = 10000 +class Ping(Chare): + def __init__(self, use_zerocopy, print_format): + self.zero_copy = use_zerocopy + self.num_chares = charm.numPes() + self.print_format = print_format + self.am_low_chare = self.thisIndex == 0 + if self.am_low_chare: + if print_format == 0: + print("Msg Size, Iterations, One-way Time (us), " + "Bandwidth (bytes/us)" + ) + else: + print(f'{"Msg Size": <30} {"Iterations": <25} ' + f'{"One-way Time (us)": <20} {"Bandwidth (bytes/us)": <20}' + ) -class Ping(Chare): + @coro + def do_iteration(self, message_size, num_iters, done_future): + data = np.zeros(message_size, dtype='int8') + partner_idx = int(not self.thisIndex) + partner = self.thisProxy[partner_idx] + partner_channel = Channel(self, partner) - def __init__(self): - self.myIndex = self.thisIndex[0] - if self.myIndex == 0: - self.neighbor = self.thisProxy[1] - else: - self.neighbor = self.thisProxy[0] - - def start(self, done_future, threaded=False): - self.done_future = done_future - self.iter = 0 - #data = np.zeros(PAYLOAD, dtype='int8') - data = 3 - self.startTime = time() - if threaded: - self.neighbor.recv_th(data) - else: - self.neighbor.recv(data) + tstart = time.time() - def recv(self, data): - if self.myIndex == 0: - self.iter += 1 - if self.iter == NITER: - totalTime = time() - self.startTime - self.done_future.send(totalTime) - return - self.neighbor.recv(data) + for _ in range(num_iters): + if self.am_low_chare: + if not self.zero_copy: + partner_channel.send(data) + partner_channel.recv() + else: + raise NotImplementedError("TODO: ZeroCopy") - @coro - def recv_th(self, data): - if self.myIndex == 0: - self.iter += 1 - if self.iter == NITER: - totalTime = time() - self.startTime - self.done_future.send(totalTime) - return - self.neighbor.recv_th(data) + else: + if not self.zero_copy: + partner_channel.recv() + partner_channel.send(data) + else: + raise NotImplementedError("TODO: ZeroCopy") + + tend = time.time() + + elapsed_time = tend - tstart + if self.am_low_chare: + self.display_iteration_data(elapsed_time, num_iters, message_size) + + self.reduce(done_future) + + def display_iteration_data(self, elapsed_time, num_iters, message_size): + elapsed_time /= 2 # 1-way performance, not RTT + 
elapsed_time /= num_iters # Time for each message + bandwidth = message_size / elapsed_time + if self.print_format == 0: + print(f'{message_size},{num_iters},{elapsed_time * 1e6},' + f'{bandwidth / 1e6}' + ) + else: + print(f'{message_size: <30} {num_iters: <25} ' + f'{elapsed_time * 1e6: <20} {bandwidth / 1e6: <20}' + ) def main(args): - threaded = False - if len(args) > 1 and args[1] == '-t': - threaded = True - pings = Array(Ping, 2) + if len(args) < 7: + print("Doesn't have the required input params. Usage:" + " " + " \n" + ) + charm.exit(-1) + + min_msg_size = int(args[1]) + max_msg_size = int(args[2]) + low_iter = int(args[3]) + high_iter = int(args[4]) + print_format = int(args[5]) + use_zerocopy = int(args[6]) + + pings = Group(Ping, args=[use_zerocopy, print_format]) charm.awaitCreation(pings) - for _ in range(2): + msg_size = min_msg_size + + while msg_size <= max_msg_size: + if msg_size <= 1048576: + iter = low_iter + else: + iter = high_iter done_future = Future() - pings[0].start(done_future, threaded) - totalTime = done_future.get() - print("ping pong time per iter (us)=", totalTime / NITER * 1000000) - exit() + pings.do_iteration(msg_size, iter, done_future) + done_future.get() + msg_size *= 2 + + charm.exit() charm.start(main) From 76c50c706c28e75ac1dee6e3c67c712fbec87f3f Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Mon, 4 Jan 2021 11:05:48 -0600 Subject: [PATCH 008/107] Add macro test --- charm4py/charmlib/ccharm.pxd | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/charm4py/charmlib/ccharm.pxd b/charm4py/charmlib/ccharm.pxd index 5bbe1b05..12111a1a 100644 --- a/charm4py/charmlib/ccharm.pxd +++ b/charm4py/charmlib/ccharm.pxd @@ -13,6 +13,11 @@ cdef extern from "charm.h": void LBTurnInstrumentOn(); void LBTurnInstrumentOff(); + """ + #if CMK_CHARM4PY + #warn "CMK_CHARM4PY macro access within Cython~~~~~\n\n\n\n\n\n\n" + """ + int CmiPeOnSamePhysicalNode(int pe1, int pe2); int CmiNumPhysicalNodes(); int CmiPhysicalNodeID(int pe); From ba4a9f17e34c7414936b71721ebb081893828b6a Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 6 Jan 2021 12:33:38 -0500 Subject: [PATCH 009/107] add methods for Charm++ CUDA interface --- charm4py/charmlib/ccharm.pxd | 7 ++----- charm4py/charmlib/charmlib_cython.pyx | 3 +++ 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/charm4py/charmlib/ccharm.pxd b/charm4py/charmlib/ccharm.pxd index 12111a1a..0da6c355 100644 --- a/charm4py/charmlib/ccharm.pxd +++ b/charm4py/charmlib/ccharm.pxd @@ -13,11 +13,6 @@ cdef extern from "charm.h": void LBTurnInstrumentOn(); void LBTurnInstrumentOff(); - """ - #if CMK_CHARM4PY - #warn "CMK_CHARM4PY macro access within Cython~~~~~\n\n\n\n\n\n\n" - """ - int CmiPeOnSamePhysicalNode(int pe1, int pe2); int CmiNumPhysicalNodes(); int CmiPhysicalNodeID(int pe); @@ -75,6 +70,8 @@ cdef extern from "charm.h": void CkStartQDExt_SectionCallback(int sid_pe, int sid_cnt, int rootPE, int ep); void CcdCallFnAfter(void (*CcdVoidFn)(void *userParam,double curWallTime), void *arg, double msecs); + int CkCudaEnabled(); + int CUDAPointerOnDevice(const void *ptr); cdef extern from "spanningTree.h": void getPETopoTreeEdges(int pe, int rootPE, int *pes, int numpes, unsigned int bfactor, diff --git a/charm4py/charmlib/charmlib_cython.pyx b/charm4py/charmlib/charmlib_cython.pyx index aad5f323..7ed31051 100644 --- a/charm4py/charmlib/charmlib_cython.pyx +++ b/charm4py/charmlib/charmlib_cython.pyx @@ -506,6 +506,9 @@ class CharmLib(object): CkRegisterArrayExt(self.chareNames[-1], numEntryMethods, &chareIdx, &startEpIdx) 
return chareIdx, startEpIdx + def CUDAPointerOnDevice(self, long address): + return CUDAPointerOnDevice(address) + def CkCreateGroup(self, int chareIdx, int epIdx, msg not None): global cur_buf msg0, dcopy = msg From 3f3467133a216343298e45b565232b7b69352c87 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Sat, 9 Jan 2021 09:25:48 -0500 Subject: [PATCH 010/107] sender-side GPU direct --- charm4py/chare.py | 20 +++++++++-- charm4py/charmlib/ccharm.pxd | 9 +++++ charm4py/charmlib/charmlib_cython.pyx | 52 +++++++++++++++++++++++++-- 3 files changed, 76 insertions(+), 5 deletions(-) diff --git a/charm4py/chare.py b/charm4py/chare.py index a1447eb4..2e54b766 100644 --- a/charm4py/chare.py +++ b/charm4py/chare.py @@ -721,7 +721,10 @@ def proxy_entry_method(proxy, *args, **kwargs): for i in range(num_args, argcount): argname = argnames[i] # first look for argument in kwargs - if argname in kwargs: + # TODO: Should stream_ptrs be skipped? + if argname == 'stream_ptrs': + continue + if argname in kwargs and argname: args.append(kwargs[argname]) else: # if not there, see if there is a default value @@ -741,15 +744,26 @@ def proxy_entry_method(proxy, *args, **kwargs): if elemIdx == (): header[b'bcast'] = True if not proxy.issec or elemIdx != (): + # TODO: Check that this is channel proxy method? destObj = None aid = proxy.aid if Options.local_msg_optim and (len(args) > 0): array = charm.arrays[aid] if elemIdx in array: destObj = array[elemIdx] - msg = charm.packMsg(destObj, args, header) - charm.CkArraySend(aid, elemIdx, ep, msg) + msg, has_gpu_data = charm.packMsg(destObj, args, header) + if has_gpu_data: + if 'stream_ptrs' in kwargs and kwargs['stream_ptrs']: + stream_ptrs = kwargs['stream_ptrs'] + else: + stream_ptrs = None + charm.CkArraySendWithDeviceData(aid, elemIdx, ep, + msg, stream_ptrs + ) + else: + charm.CkArraySend(aid, elemIdx, ep, msg) else: + # TODO: Error if trying to send ZC data root, sid = proxy.section header[b'sid'] = sid if Options.local_msg_optim and root == charm._myPe: diff --git a/charm4py/charmlib/ccharm.pxd b/charm4py/charmlib/ccharm.pxd index 0da6c355..debcd537 100644 --- a/charm4py/charmlib/ccharm.pxd +++ b/charm4py/charmlib/ccharm.pxd @@ -72,6 +72,15 @@ cdef extern from "charm.h": int CkCudaEnabled(); int CUDAPointerOnDevice(const void *ptr); + void CkChareExtSendWithDeviceData(int aid, int *idx, int ndims, + int epIdx, int num_bufs, char *msg, + int msgSize, + void *devBufPtrs, + void *devBufSizesInBytes, + void *streamPtrs, int numDevBufs + ); + + cdef extern from "spanningTree.h": void getPETopoTreeEdges(int pe, int rootPE, int *pes, int numpes, unsigned int bfactor, diff --git a/charm4py/charmlib/charmlib_cython.pyx b/charm4py/charmlib/charmlib_cython.pyx index 7ed31051..0b5882f6 100644 --- a/charm4py/charmlib/charmlib_cython.pyx +++ b/charm4py/charmlib/charmlib_cython.pyx @@ -1,6 +1,6 @@ from ccharm cimport * from libc.stdlib cimport malloc, free -from libc.string cimport memcpy +from libc.string cimport memcpy, memset from libc.stdint cimport uintptr_t from cpython.version cimport PY_MAJOR_VERSION from cpython.buffer cimport PyObject_GetBuffer, PyBuffer_Release, PyBUF_ANY_CONTIGUOUS, PyBUF_SIMPLE @@ -292,12 +292,16 @@ cdef extern const char * const CmiCommitID cdef (char*)[NUM_DCOPY_BUFS] send_bufs # ?TODO bounds checking is needed where this is used cdef int[NUM_DCOPY_BUFS] send_buf_sizes # ?TODO bounds checking is needed where this is used cdef int cur_buf = 1 +cdef int gpu_direct_buf_idx = 0 cdef int[MAX_INDEX_LEN] c_index cdef Py_buffer send_buffer cdef 
ReceiveMsgBuffer recv_buffer = ReceiveMsgBuffer() cdef c_type_table_typecodes = [None] * 13 cdef int c_type_table_sizes[13] cdef int[SECTION_MAX_BFACTOR] section_children +cdef long[NUM_DCOPY_BUFS] gpu_direct_device_ptrs +cdef long[NUM_DCOPY_BUFS] gpu_direct_buff_sizes +cdef long[NUM_DCOPY_BUFS] gpu_direct_stream_ptrs cdef object charm cdef object charm_reducer_to_ctype @@ -449,6 +453,35 @@ class CharmLib(object): CkGroupExtSend_multi(group_id, num_pes, section_children, ep, cur_buf, send_bufs, send_buf_sizes) cur_buf = 1 + def CkArraySendWithDeviceData(self, int array_id, index not None, int ep, + msg not None, list stream_ptrs): + + global gpu_direct_buf_idx + cdef int i = 0 + cdef int ndims = len(index) + assert ndims == 1 + c_index[0] = index[0] + msg0, dcopy = msg + cdef int num_direct_buffers = gpu_direct_buf_idx + 1 + # TODO: Message on assertion failure + assert num_direct_buffers <= NUM_DCOPY_BUFS + global gpu_direct_device_ptrs + global gpu_direct_stream_ptrs + + if stream_ptrs: + for i in range(num_direct_buffers): + gpu_direct_stream_ptrs[i] = stream_ptrs[i] + else: + memset(gpu_direct_stream_ptrs, 0, sizeof(long) * num_direct_buffers) + + CkChareExtSendWithDeviceData(array_id, c_index, ndims, ep, 1, msg0, len(msg0), + gpu_direct_device_ptrs, + gpu_direct_buff_sizes, + gpu_direct_stream_ptrs, + num_direct_buffers + ) + gpu_direct_buf_idx = 0 + def CkArraySend(self, int array_id, index not None, int ep, msg not None): global cur_buf msg0, dcopy = msg @@ -787,8 +820,12 @@ class CharmLib(object): else: direct_copy_hdr = [] # goes to header args = list(msgArgs) + msg_has_gpu_args = False global cur_buf + global gpu_direct_buf_idx + global gpu_direct_device_ptrs cur_buf = 1 + gpu_direct_buf_idx = 0 for i in range(len(args)): arg = msgArgs[i] if isinstance(arg, np.ndarray) and not arg.dtype.hasobject: @@ -806,6 +843,15 @@ class CharmLib(object): nbytes = len(a) * a.itemsize # NOTE that cython's array C interface doesn't expose itemsize attribute direct_copy_hdr.append((i, 1, (a.typecode), nbytes)) send_bufs[cur_buf] = a.data.as_voidptr + elif CkCudaEnabled() and hasattr(arg, '__cuda_array_interface__'): + # we want to take the args that implement the cuda array interface and make them into ckdevicebuffers + # assumption: we can get nbytes from the arg directly + # TODO: verify this assertion for other types + gpu_direct_device_ptrs[gpu_direct_buf_idx] = arg['__cuda_array_interface__']['data'][0] + gpu_direct_buff_sizes[gpu_direct_buf_idx] = arg.nbytes + cuda_dev_info = True + gpu_direct_buf_idx += 1 + continue else: continue args[i] = None # will direct-copy this arg so remove from args list @@ -817,10 +863,12 @@ class CharmLib(object): msg = dumps((header, args), PICKLE_PROTOCOL) except: global cur_buf + global gpu_direct_buf_idx cur_buf = 1 + gpu_direct_buf_idx = 0 raise if PROFILING: charm.recordSend(len(msg) + dcopy_size) - return msg, None + return msg, cuda_dev_info def scheduleTagAfter(self, int tag, double msecs): CcdCallFnAfter(CcdCallFnAfterCallback, tag, msecs) From 4048015d5ee6132aec69ac494ba6184f54f59944 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Mon, 11 Jan 2021 19:12:34 -0500 Subject: [PATCH 011/107] add method to get GPU data --- charm4py/channel.py | 9 ++++++++- charm4py/charm.py | 23 ++++++++++++++++++++++- 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/charm4py/channel.py b/charm4py/channel.py index aaf058ec..c6748ee7 100644 --- a/charm4py/channel.py +++ b/charm4py/channel.py @@ -1,4 +1,5 @@ from .threads import LocalFuture +from .charm4py 
import getGPUDirectData class Channel(object): @@ -60,7 +61,7 @@ def send(self, *msg): self.remote._channelRecv__(self.remote_port, self.send_seqno, *msg) self.send_seqno = (self.send_seqno + 1) % CHAN_BUF_SIZE - def recv(self): + def recv(self, *post_buffers, stream_ptrs = None: if self.recv_seqno in self.data: ret = self.data.pop(self.recv_seqno) else: @@ -68,4 +69,10 @@ def recv(self): ret = self.recv_fut.get() self.recv_fut = None self.recv_seqno = (self.recv_seqno + 1) % CHAN_BUF_SIZE + + if post_buffers: + gpu_recv_bufs = ret.pop() + assert len(post_buffers) == len(gpu_recv_bufs) + recv_future = getGPUDirectData(post_buffers, gpu_recv_bufs, stream_ptrs) + recv_future.get() return ret diff --git a/charm4py/charm.py b/charm4py/charm.py index a13935fe..bc9b8d9d 100644 --- a/charm4py/charm.py +++ b/charm4py/charm.py @@ -123,6 +123,12 @@ def __init__(self): self.reducers = reduction.ReducerContainer(self) self.redMgr = reduction.ReductionManager(self, self.reducers) self.mainchareRegistered = False + # TODO: create a 'charm' CUDA interface + try: + from numba import cuda as numba_cuda + self.CUDA = numba_cuda + except ImportError: + raise Charm4PyError("Currently numba is required to use Charm4Py (temporary)") # entry point to Charm program. can be used in place of defining a Mainchare self.entry_func = None if self.lib.name == 'cython': @@ -305,6 +311,14 @@ def recvArrayMsg(self, aid, index, ep, msg, dcopy_start): self.arrays[aid][index] = obj em.run(obj, header, args) # now call the user's array element __init__ + def recvGPUDirectMsg(self, aid, index, ep, + devBuf_ptrs, msg, dcopy_start + ): + obj = self.arrays[aid][index] + header, args = self.unpackMsg(msg, dcopy_start, obj) + args.append(devBuf_ptrs) + self.invokeEntryMethod(obj, ep, header, args) + def recvArrayBcast(self, aid, indexes, ep, msg, dcopy_start): header, args = self.unpackMsg(msg, dcopy_start, None) array = self.arrays[aid] @@ -312,6 +326,7 @@ def recvArrayBcast(self, aid, indexes, ep, msg, dcopy_start): self.invokeEntryMethod(array[index], ep, header, args) def unpackMsg(self, msg, dcopy_start, dest_obj): + # Issue Rgets for GPU data in unpackMsg? But how does recv work? if msg[:7] == b'_local:': header, args = dest_obj.__removeLocal__(int(msg[7:])) else: @@ -332,6 +347,13 @@ def unpackMsg(self, msg, dcopy_start, dest_obj): return header, args + def getGPUDirectData(self, post_buffers, gpu_recv_bufs, stream_ptrs): + return_fut = self.Future() + if not streams: + stream_ptrs = [0] * len(post_buffers) + self.lib.getGPUDirectData(return_fut, post_buffers, gpu_recv_bufs, stream_ptrs) + return return_fut + def packMsg(self, destObj, msgArgs, header): """Prepares a message for sending, given arguments to an entry method invocation. 
@@ -1156,6 +1178,5 @@ def rebuildNumpyArray(data, shape, dt): a.shape = shape return a.copy() - charm = Charm() readonlies = __ReadOnlies() From fc535bbcc53548742c2cb50ac9fa3e5ee9ccaefe Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Mon, 11 Jan 2021 19:13:25 -0500 Subject: [PATCH 012/107] add methods to support receiver-side GPU Direct --- charm4py/charmlib/ccharm.pxd | 8 +++++ charm4py/charmlib/charmlib_cython.pyx | 43 ++++++++++++++++++++++++++- 2 files changed, 50 insertions(+), 1 deletion(-) diff --git a/charm4py/charmlib/ccharm.pxd b/charm4py/charmlib/ccharm.pxd index debcd537..2ec0dafb 100644 --- a/charm4py/charmlib/ccharm.pxd +++ b/charm4py/charmlib/ccharm.pxd @@ -70,6 +70,7 @@ cdef extern from "charm.h": void CkStartQDExt_SectionCallback(int sid_pe, int sid_cnt, int rootPE, int ep); void CcdCallFnAfter(void (*CcdVoidFn)(void *userParam,double curWallTime), void *arg, double msecs); + # TODO: Organize these to place them near their related functions int CkCudaEnabled(); int CUDAPointerOnDevice(const void *ptr); void CkChareExtSendWithDeviceData(int aid, int *idx, int ndims, @@ -80,9 +81,16 @@ cdef extern from "charm.h": void *streamPtrs, int numDevBufs ); + void registerArrayMsgGPUDirectRecvExtCallback(void (*cb)(int, int, int*, int, int, long*, void *, int, char*, int)); + cdef extern from "spanningTree.h": void getPETopoTreeEdges(int pe, int rootPE, int *pes, int numpes, unsigned int bfactor, int *parent, int *child_count, int **children); + +# cdef extern from "ckrdmadevice.h": +# cdef cppclass CkDeviceBuffer: +# CkDeviceBuffer() + diff --git a/charm4py/charmlib/charmlib_cython.pyx b/charm4py/charmlib/charmlib_cython.pyx index 0b5882f6..af1e4a43 100644 --- a/charm4py/charmlib/charmlib_cython.pyx +++ b/charm4py/charmlib/charmlib_cython.pyx @@ -285,9 +285,18 @@ cdef inline object array_index_to_tuple(int ndims, int *arrayIndex): PyTuple_SET_ITEM(arrIndex, i, d) return arrIndex - cdef extern const char * const CmiCommitID +# cdef class PyCkDeviceBuffer: +# cdef CkDeviceBuffer c_buff + +# @staticmethod +# cdef PyCkDeviceBuffer from_ptr(CkDeviceBuffer buf): +# cdef PyCkDeviceBuffer newBuf = PyCkDeviceBuffer.__new__(PyCkDeviceBuffer) +# newBuf.c_buff = buf +# return newBuf + + # supports up to NUM_DCOPY_BUFS direct-copy entry method arguments cdef (char*)[NUM_DCOPY_BUFS] send_bufs # ?TODO bounds checking is needed where this is used cdef int[NUM_DCOPY_BUFS] send_buf_sizes # ?TODO bounds checking is needed where this is used @@ -739,6 +748,7 @@ class CharmLib(object): registerChareMsgRecvExtCallback(recvChareMsg) registerGroupMsgRecvExtCallback(recvGroupMsg) registerArrayMsgRecvExtCallback(recvArrayMsg) + registerArrayMsgGPUDirectRecvExtCallback(recvGPUDirectMsg) registerArrayBcastRecvExtCallback(recvArrayBcast) registerArrayMapProcNumExtCallback(arrayMapProcNum) registerArrayElemJoinExtCallback(arrayElemJoin) @@ -873,6 +883,17 @@ class CharmLib(object): def scheduleTagAfter(self, int tag, double msecs): CcdCallFnAfter(CcdCallFnAfterCallback, tag, msecs) + def getGPUDirectData(self, list postbuf_ptrs, list gpu_recv_bufs, list stream_ptrs): + cdef int num_buffers = len(gpu_recv_bufs) + cdef int[num_buffers] gpu_buf_sizes + cdef (int*)[num_buffers] gpu_buf_ptrs + cdef int[num_buffers] stream_ptrs + + for idx in range(num_buffers): + gpu_buf_sizes[idx] = gpu_recv_bufs[idx][0] + gpu_buf_ptrs[idx] = gpu_recv_bufs[idx][1] + stream_ptrs[idx] = streams_ptrs[idx] + CkGetGPUDirectData() # first callback from Charm++ shared library cdef void registerMainModule(): @@ -925,6 +946,25 @@ cdef void 
recvArrayMsg(int aid, int ndims, int *arrayIndex, int ep, int msgSize, except: charm.handleGeneralError() +cdef void recvGPUDirectMsg(int aid, int ndims, int *arrayIndex, int ep, int numDevBuffs, + long *devBufSizes, void *devBufs, int msgSize, + char *msg, int dcopy_start): + cdef int idx = 0 + cdef void *bptr + try: + if PROFILING: + charm._precvtime = time.time() + charm.recordReceive(msgSize) + devBufInfo = [] + for idx in range(numDevBuffs): + dev_buf = devBufs[idx] + devBufInfo.append((devBufSizes[idx], dev_buf)) + recv_buffer.setMsg(msg, msgSize) + charm.recvGPUDirectMsg(aid, array_index_to_tuple(ndims, arrayIndex), ep, recv_buffer, mg, dcopy_start) + except: + charm.handleGeneralError() + + cdef void recvArrayBcast(int aid, int ndims, int nInts, int numElems, int *arrayIndexes, int ep, int msgSize, char *msg, int dcopy_start): cdef int i = 0 try: @@ -1107,3 +1147,4 @@ cdef void CcdCallFnAfterCallback(void *userParam, double curWallTime): charm.triggerCallable(userParam) except: charm.handleGeneralError() + From 67ed75bd2afb52eb39780b206b91fd6c3751a2d5 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Fri, 15 Jan 2021 16:07:22 -0500 Subject: [PATCH 013/107] hook up to Charm++ GPUDirect functionality --- charm4py/charmlib/ccharm.pxd | 1 + charm4py/charmlib/charmlib_cython.pyx | 35 +++++++++++++++++++-------- 2 files changed, 26 insertions(+), 10 deletions(-) diff --git a/charm4py/charmlib/ccharm.pxd b/charm4py/charmlib/ccharm.pxd index 2ec0dafb..bb8c1c3a 100644 --- a/charm4py/charmlib/ccharm.pxd +++ b/charm4py/charmlib/ccharm.pxd @@ -82,6 +82,7 @@ cdef extern from "charm.h": ); void registerArrayMsgGPUDirectRecvExtCallback(void (*cb)(int, int, int*, int, int, long*, void *, int, char*, int)); + void CkGetGPUDirectData(int numBuffers, void *recvBufPtrs, int *arrSizes, void *remoteBufInfo, void *streamPtrs); diff --git a/charm4py/charmlib/charmlib_cython.pyx b/charm4py/charmlib/charmlib_cython.pyx index af1e4a43..7c45f856 100644 --- a/charm4py/charmlib/charmlib_cython.pyx +++ b/charm4py/charmlib/charmlib_cython.pyx @@ -883,17 +883,32 @@ class CharmLib(object): def scheduleTagAfter(self, int tag, double msecs): CcdCallFnAfter(CcdCallFnAfterCallback, tag, msecs) - def getGPUDirectData(self, list postbuf_ptrs, list gpu_recv_bufs, list stream_ptrs): - cdef int num_buffers = len(gpu_recv_bufs) - cdef int[num_buffers] gpu_buf_sizes - cdef (int*)[num_buffers] gpu_buf_ptrs - cdef int[num_buffers] stream_ptrs + def getGPUDirectData(self, list post_buf_data, list remote_bufs, list stream_ptrs, return_fut): + cdef int num_buffers = len(post_buf_data) + cdef array.array int_array_template = array.array('i', []) + cdef array.array long_array_template = array.array('L', []) + cdef array.array recv_buf_sizes + cdef array.array recv_buf_ptrs + # pointers from the remote that we will be issuing Rgets for + # these are pointers to type CkDeviceBuffer + cdef array.array remote_buf_ptrs + + recv_buf_sizes = array.clone(int_array_template, num_buffers, zero=False) + recv_buf_ptrs = array.clone(long_array_template, num_buffers, zero=False) + stream_ptrs_forc = array.clone(long_array_template, num_buffers, zero=False) + remote_buf_ptrs = array.clone(long_array_template, num_buffers, zero=False) for idx in range(num_buffers): - gpu_buf_sizes[idx] = gpu_recv_bufs[idx][0] - gpu_buf_ptrs[idx] = gpu_recv_bufs[idx][1] - stream_ptrs[idx] = streams_ptrs[idx] - CkGetGPUDirectData() + recv_buf_ptrs[idx] = post_buf_data[idx][0] + recv_buf_sizes[idx] = remote_bufs[idx][0] + remote_buf_ptrs = remote_bufs[idx][1] + 
stream_ptrs_forc[idx] = stream_ptrs[idx] + # what do we do about the return future? Need to turn it into some callback. + CkGetGPUDirectData(num_buffers, recv_buf_ptrs.data.as_voidptr, + recv_buf_sizes.data.as_voidptr, + remote_buf_ptrs.data.as_voidptr, + stream_ptrs_forc.data.as_voidptr + ) # first callback from Charm++ shared library cdef void registerMainModule(): @@ -960,7 +975,7 @@ cdef void recvGPUDirectMsg(int aid, int ndims, int *arrayIndex, int ep, int numD dev_buf = devBufs[idx] devBufInfo.append((devBufSizes[idx], dev_buf)) recv_buffer.setMsg(msg, msgSize) - charm.recvGPUDirectMsg(aid, array_index_to_tuple(ndims, arrayIndex), ep, recv_buffer, mg, dcopy_start) + charm.recvGPUDirectMsg(aid, array_index_to_tuple(ndims, arrayIndex), ep, recv_buffer, msg, dcopy_start) except: charm.handleGeneralError() From 0f8bf705afe8138e522af92e5582a30679ce8c1a Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Fri, 15 Jan 2021 16:08:11 -0500 Subject: [PATCH 014/107] hooks into lib --- charm4py/charm.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/charm4py/charm.py b/charm4py/charm.py index bc9b8d9d..f4da7506 100644 --- a/charm4py/charm.py +++ b/charm4py/charm.py @@ -40,6 +40,11 @@ class NumpyDummy: def SECTION_ALL(obj): return 0 +def getDeviceDataInfo(devArray): + return devArray['__cuda_array_interface__']['data'] + +def getDeviceDataAddress(devArray): + return getDeviceDataInfo(devArray[0]) class Options(object): @@ -123,12 +128,6 @@ def __init__(self): self.reducers = reduction.ReducerContainer(self) self.redMgr = reduction.ReductionManager(self, self.reducers) self.mainchareRegistered = False - # TODO: create a 'charm' CUDA interface - try: - from numba import cuda as numba_cuda - self.CUDA = numba_cuda - except ImportError: - raise Charm4PyError("Currently numba is required to use Charm4Py (temporary)") # entry point to Charm program. 
can be used in place of defining a Mainchare self.entry_func = None if self.lib.name == 'cython': @@ -347,11 +346,12 @@ def unpackMsg(self, msg, dcopy_start, dest_obj): return header, args - def getGPUDirectData(self, post_buffers, gpu_recv_bufs, stream_ptrs): + def getGPUDirectData(self, post_buffers, remote_bufs, stream_ptrs): return_fut = self.Future() + post_buf_data = [getDeviceDataAddress(buf) for buf in post_buffers] if not streams: stream_ptrs = [0] * len(post_buffers) - self.lib.getGPUDirectData(return_fut, post_buffers, gpu_recv_bufs, stream_ptrs) + self.lib.getGPUDirectData(post_buf_data, remote_bufs, stream_ptrs, return_fut) return return_fut def packMsg(self, destObj, msgArgs, header): From 18b4e136209ef0d8847c767635e9131ccaae8d5c Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Fri, 15 Jan 2021 16:08:32 -0500 Subject: [PATCH 015/107] fix syntax error --- charm4py/channel.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/charm4py/channel.py b/charm4py/channel.py index c6748ee7..e6824ecb 100644 --- a/charm4py/channel.py +++ b/charm4py/channel.py @@ -1,5 +1,5 @@ from .threads import LocalFuture -from .charm4py import getGPUDirectData +from .charm import Charm class Channel(object): @@ -61,7 +61,7 @@ def send(self, *msg): self.remote._channelRecv__(self.remote_port, self.send_seqno, *msg) self.send_seqno = (self.send_seqno + 1) % CHAN_BUF_SIZE - def recv(self, *post_buffers, stream_ptrs = None: + def recv(self, *post_buffers, stream_ptrs = None): if self.recv_seqno in self.data: ret = self.data.pop(self.recv_seqno) else: @@ -73,6 +73,6 @@ def recv(self, *post_buffers, stream_ptrs = None: if post_buffers: gpu_recv_bufs = ret.pop() assert len(post_buffers) == len(gpu_recv_bufs) - recv_future = getGPUDirectData(post_buffers, gpu_recv_bufs, stream_ptrs) + Charm.recv_future = getGPUDirectData(post_buffers, gpu_recv_bufs, stream_ptrs) recv_future.get() return ret From 22297d27e3d41f28239e5296714f69685482d76c Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Sun, 17 Jan 2021 10:25:31 -0500 Subject: [PATCH 016/107] fix libcharm call --- charm4py/charm.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/charm4py/charm.py b/charm4py/charm.py index f4da7506..6bb3f28a 100644 --- a/charm4py/charm.py +++ b/charm4py/charm.py @@ -41,10 +41,10 @@ def SECTION_ALL(obj): return 0 def getDeviceDataInfo(devArray): - return devArray['__cuda_array_interface__']['data'] + return devArray.__cuda_array_interface__['data'] def getDeviceDataAddress(devArray): - return getDeviceDataInfo(devArray[0]) + return getDeviceDataInfo(devArray)[0] class Options(object): @@ -125,6 +125,7 @@ def __init__(self): self.CkChareSend = self.lib.CkChareSend self.CkGroupSend = self.lib.CkGroupSend self.CkArraySend = self.lib.CkArraySend + self.CkArraySendWithDeviceData = self.lib.CkArraySendWithDeviceData self.reducers = reduction.ReducerContainer(self) self.redMgr = reduction.ReductionManager(self, self.reducers) self.mainchareRegistered = False @@ -347,9 +348,9 @@ def unpackMsg(self, msg, dcopy_start, dest_obj): return header, args def getGPUDirectData(self, post_buffers, remote_bufs, stream_ptrs): - return_fut = self.Future() + return_fut = self.Future(len(post_buffers)) post_buf_data = [getDeviceDataAddress(buf) for buf in post_buffers] - if not streams: + if not stream_ptrs: stream_ptrs = [0] * len(post_buffers) self.lib.getGPUDirectData(post_buf_data, remote_bufs, stream_ptrs, return_fut) return return_fut From 0b316d6fdd602326122a960d95486b6673f87ff5 Mon Sep 17 00:00:00 2001 
From: Zane Fink Date: Sun, 17 Jan 2021 10:25:48 -0500 Subject: [PATCH 017/107] gpu_recv_bufs now correctly retrieved --- charm4py/channel.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/charm4py/channel.py b/charm4py/channel.py index e6824ecb..c7883a7f 100644 --- a/charm4py/channel.py +++ b/charm4py/channel.py @@ -1,5 +1,5 @@ from .threads import LocalFuture -from .charm import Charm +from .charm import charm class Channel(object): @@ -71,8 +71,15 @@ def recv(self, *post_buffers, stream_ptrs = None): self.recv_seqno = (self.recv_seqno + 1) % CHAN_BUF_SIZE if post_buffers: - gpu_recv_bufs = ret.pop() + gpu_recv_bufs = ret[-1] + ret = ret[:-1:1] assert len(post_buffers) == len(gpu_recv_bufs) - Charm.recv_future = getGPUDirectData(post_buffers, gpu_recv_bufs, stream_ptrs) + recv_future = charm.getGPUDirectData(post_buffers, gpu_recv_bufs, stream_ptrs) recv_future.get() return ret + + + + + + From 01a31d04d5ab3413497b4d6bb4299d9698500728 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Sun, 17 Jan 2021 10:26:26 -0500 Subject: [PATCH 018/107] fix incorrect check for CUDA data --- charm4py/chare.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/charm4py/chare.py b/charm4py/chare.py index 2e54b766..58ab2c06 100644 --- a/charm4py/chare.py +++ b/charm4py/chare.py @@ -751,8 +751,8 @@ def proxy_entry_method(proxy, *args, **kwargs): array = charm.arrays[aid] if elemIdx in array: destObj = array[elemIdx] - msg, has_gpu_data = charm.packMsg(destObj, args, header) - if has_gpu_data: + msg = charm.packMsg(destObj, args, header) + if msg[1]: if 'stream_ptrs' in kwargs and kwargs['stream_ptrs']: stream_ptrs = kwargs['stream_ptrs'] else: From 683a1c0ac43dbb429848d5997a6c4449a45a01e0 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Sun, 17 Jan 2021 10:26:42 -0500 Subject: [PATCH 019/107] add more API calls --- charm4py/charmlib/ccharm.pxd | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/charm4py/charmlib/ccharm.pxd b/charm4py/charmlib/ccharm.pxd index bb8c1c3a..7f2a34b2 100644 --- a/charm4py/charmlib/ccharm.pxd +++ b/charm4py/charmlib/ccharm.pxd @@ -76,13 +76,18 @@ cdef extern from "charm.h": void CkChareExtSendWithDeviceData(int aid, int *idx, int ndims, int epIdx, int num_bufs, char *msg, int msgSize, - void *devBufPtrs, - void *devBufSizesInBytes, - void *streamPtrs, int numDevBufs + long *devBufPtrs, + long *devBufSizesInBytes, + long *streamPtrs, int numDevBufs ); void registerArrayMsgGPUDirectRecvExtCallback(void (*cb)(int, int, int*, int, int, long*, void *, int, char*, int)); - void CkGetGPUDirectData(int numBuffers, void *recvBufPtrs, int *arrSizes, void *remoteBufInfo, void *streamPtrs); + void CkGetGPUDirectData(int numBuffers, void *recvBufPtrs, int *arrSizes, + void *remoteBufInfo, void *streamPtrs, int *futureId); + + int CkDeviceBufferSizeInBytes(); + + void registerDepositFutureWithIdFn(void (*cb)(void*, void*)); From 91a1737560d677d4a35021fed4d92b907590222f Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Sun, 17 Jan 2021 10:28:24 -0500 Subject: [PATCH 020/107] add call to register future deposit, add size of ck device buffer in bytes --- charm4py/charmlib/charmlib_cython.pyx | 62 ++++++++++++++++----------- 1 file changed, 37 insertions(+), 25 deletions(-) diff --git a/charm4py/charmlib/charmlib_cython.pyx b/charm4py/charmlib/charmlib_cython.pyx index 7c45f856..f62be726 100644 --- a/charm4py/charmlib/charmlib_cython.pyx +++ b/charm4py/charmlib/charmlib_cython.pyx @@ -30,7 +30,7 @@ ELSE: np = 
NumpyDummyModule() cdef object np_number = np.number - +cdef int CK_DEVICEBUFFER_SIZE_IN_BYTES = CkDeviceBufferSizeInBytes() # ------ global constants ------ @@ -468,10 +468,11 @@ class CharmLib(object): global gpu_direct_buf_idx cdef int i = 0 cdef int ndims = len(index) - assert ndims == 1 - c_index[0] = index[0] + # assert ndims == 1 + for i in range(ndims): c_index[i] = index[i] msg0, dcopy = msg - cdef int num_direct_buffers = gpu_direct_buf_idx + 1 + dcopy = None + cdef int num_direct_buffers = gpu_direct_buf_idx # TODO: Message on assertion failure assert num_direct_buffers <= NUM_DCOPY_BUFS global gpu_direct_device_ptrs @@ -484,9 +485,9 @@ class CharmLib(object): memset(gpu_direct_stream_ptrs, 0, sizeof(long) * num_direct_buffers) CkChareExtSendWithDeviceData(array_id, c_index, ndims, ep, 1, msg0, len(msg0), - gpu_direct_device_ptrs, - gpu_direct_buff_sizes, - gpu_direct_stream_ptrs, + gpu_direct_device_ptrs, + gpu_direct_buff_sizes, + gpu_direct_stream_ptrs, num_direct_buffers ) gpu_direct_buf_idx = 0 @@ -754,6 +755,7 @@ class CharmLib(object): registerArrayElemJoinExtCallback(arrayElemJoin) registerPyReductionExtCallback(pyReduction) registerCreateCallbackMsgExtCallback(createCallbackMsg) + registerDepositFutureWithIdFn(depositFutureWithId); def CkMyPe(self): return CkMyPeHook() def CkNumPes(self): return CkNumPesHook() @@ -820,6 +822,7 @@ class CharmLib(object): cdef array.array a IF HAVE_NUMPY: cdef np.ndarray np_array + cuda_dev_info = None dcopy_size = 0 if destObj is not None: # if dest obj is local localTag = destObj.__addLocal__((header, msgArgs)) @@ -830,7 +833,6 @@ class CharmLib(object): else: direct_copy_hdr = [] # goes to header args = list(msgArgs) - msg_has_gpu_args = False global cur_buf global gpu_direct_buf_idx global gpu_direct_device_ptrs @@ -838,7 +840,17 @@ class CharmLib(object): gpu_direct_buf_idx = 0 for i in range(len(args)): arg = msgArgs[i] - if isinstance(arg, np.ndarray) and not arg.dtype.hasobject: + if CkCudaEnabled() and hasattr(arg, '__cuda_array_interface__'): + # we want to take the args that implement the cuda array interface and make them into ckdevicebuffers + # assumption: we can get nbytes from the arg directly + # TODO: verify this assertion for other types + gpu_direct_device_ptrs[gpu_direct_buf_idx] = arg.__cuda_array_interface__['data'][0] + gpu_direct_buff_sizes[gpu_direct_buf_idx] = arg.nbytes + cuda_dev_info = True + gpu_direct_buf_idx += 1 + args[i] = None # TODO: should this be done? 
+ continue + elif isinstance(arg, np.ndarray) and not arg.dtype.hasobject: np_array = arg nbytes = np_array.nbytes direct_copy_hdr.append((i, 2, (arg.shape, np_array.dtype.name), nbytes)) @@ -853,15 +865,6 @@ class CharmLib(object): nbytes = len(a) * a.itemsize # NOTE that cython's array C interface doesn't expose itemsize attribute direct_copy_hdr.append((i, 1, (a.typecode), nbytes)) send_bufs[cur_buf] = a.data.as_voidptr - elif CkCudaEnabled() and hasattr(arg, '__cuda_array_interface__'): - # we want to take the args that implement the cuda array interface and make them into ckdevicebuffers - # assumption: we can get nbytes from the arg directly - # TODO: verify this assertion for other types - gpu_direct_device_ptrs[gpu_direct_buf_idx] = arg['__cuda_array_interface__']['data'][0] - gpu_direct_buff_sizes[gpu_direct_buf_idx] = arg.nbytes - cuda_dev_info = True - gpu_direct_buf_idx += 1 - continue else: continue args[i] = None # will direct-copy this arg so remove from args list @@ -885,6 +888,7 @@ class CharmLib(object): def getGPUDirectData(self, list post_buf_data, list remote_bufs, list stream_ptrs, return_fut): cdef int num_buffers = len(post_buf_data) + cdef int *future_id = malloc(sizeof(int)) cdef array.array int_array_template = array.array('i', []) cdef array.array long_array_template = array.array('L', []) cdef array.array recv_buf_sizes @@ -899,15 +903,16 @@ class CharmLib(object): remote_buf_ptrs = array.clone(long_array_template, num_buffers, zero=False) for idx in range(num_buffers): - recv_buf_ptrs[idx] = post_buf_data[idx][0] + recv_buf_ptrs[idx] = post_buf_data[idx] recv_buf_sizes[idx] = remote_bufs[idx][0] - remote_buf_ptrs = remote_bufs[idx][1] + remote_buf_ptrs[idx] = remote_bufs[idx][1] stream_ptrs_forc[idx] = stream_ptrs[idx] # what do we do about the return future? Need to turn it into some callback. 
CkGetGPUDirectData(num_buffers, recv_buf_ptrs.data.as_voidptr, recv_buf_sizes.data.as_voidptr, remote_buf_ptrs.data.as_voidptr, - stream_ptrs_forc.data.as_voidptr + stream_ptrs_forc.data.as_voidptr, + future_id ) # first callback from Charm++ shared library @@ -965,17 +970,18 @@ cdef void recvGPUDirectMsg(int aid, int ndims, int *arrayIndex, int ep, int numD long *devBufSizes, void *devBufs, int msgSize, char *msg, int dcopy_start): cdef int idx = 0 - cdef void *bptr try: if PROFILING: charm._precvtime = time.time() charm.recordReceive(msgSize) devBufInfo = [] for idx in range(numDevBuffs): - dev_buf = devBufs[idx] - devBufInfo.append((devBufSizes[idx], dev_buf)) + # Add the size of this buffer and a pointer to it to the info list + devBufInfo.append((devBufSizes[idx], + (devBufs+(CK_DEVICEBUFFER_SIZE_IN_BYTES*idx))) + ) recv_buffer.setMsg(msg, msgSize) - charm.recvGPUDirectMsg(aid, array_index_to_tuple(ndims, arrayIndex), ep, recv_buffer, msg, dcopy_start) + charm.recvGPUDirectMsg(aid, array_index_to_tuple(ndims, arrayIndex), ep, devBufInfo, recv_buffer, dcopy_start) except: charm.handleGeneralError() @@ -1037,6 +1043,12 @@ cdef void resumeFromSync(int aid, int ndims, int *arrayIndex): except: charm.handleGeneralError() +cdef void depositFutureWithId(void *param, void *msg): +# TODO: Figure out how this param value should be allocated/deallocated + cdef int futureId = ( param)[0] + free(param) + charm._future_deposit_result(futureId) + cdef void createCallbackMsg(void *data, int dataSize, int reducerType, int fid, int *sectionInfo, char **returnBuffers, int *returnBufferSizes): cdef int numElems From e012722919c2104c668d4b39811603f890f54c66 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Sun, 17 Jan 2021 10:28:51 -0500 Subject: [PATCH 021/107] WIP --- tests/benchmark/pingpong_gpu.py | 36 ++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/tests/benchmark/pingpong_gpu.py b/tests/benchmark/pingpong_gpu.py index 9ade4ecb..30310db7 100644 --- a/tests/benchmark/pingpong_gpu.py +++ b/tests/benchmark/pingpong_gpu.py @@ -1,16 +1,16 @@ -from charm4py import charm, Chare, Array, coro, Future, Channel, Group +from charm4py import charm, Chare, Array, coro, Future, Channel, Group, ArrayMap import time import numpy as np from numba import cuda -USE_PINNED = True +USE_PINNED = False class Ping(Chare): def __init__(self, use_gpudirect, print_format): self.gpu_direct = use_gpudirect self.num_chares = charm.numPes() self.print_format = print_format - self.am_low_chare = self.thisIndex == 0 + self.am_low_chare = self.thisIndex[0] == 0 if self.am_low_chare: if print_format == 0: @@ -27,10 +27,13 @@ def do_iteration(self, message_size, num_iters, done_future): if USE_PINNED: h_data = cuda.pinned_array(message_size, dtype='int8') else: - h_data = np.zeros(message_size, dtype='int8') + if self.am_low_chare: + h_data = np.ones(message_size, dtype='int8') + else: + h_data = np.zeros(message_size, dtype='int8') d_data = cuda.device_array(message_size, dtype='int8') d_data.copy_to_device(h_data) - partner_idx = int(not self.thisIndex) + partner_idx = int(not self.thisIndex[0]) partner = self.thisProxy[partner_idx] partner_channel = Channel(self, partner) @@ -40,11 +43,14 @@ def do_iteration(self, message_size, num_iters, done_future): if self.am_low_chare: if not self.gpu_direct: d_data.copy_to_host(h_data) - # partner_channel.send(dev_array) partner_channel.send(h_data) d_data.copy_to_device(partner_channel.recv()) else: - raise NotImplementedError("TODO: GPU Direct") + 
partner_channel.send(d_data) + break + # partner_channel.recv(d_data) + # sleep because callbacks not implemented yet + # charm.sleep(0.15) else: if not self.gpu_direct: @@ -52,7 +58,10 @@ def do_iteration(self, message_size, num_iters, done_future): d_data.copy_to_host(h_data) partner_channel.send(h_data) else: - raise NotImplementedError("TODO: GPU Direct") + partner_channel.recv(d_data) + # d_data.copy_to_host(h_data) + # print(h_data[0]) + # partner_channel.send(d_data) tend = time.time() @@ -76,13 +85,19 @@ def display_iteration_data(self, elapsed_time, num_iters, message_size): f'{elapsed_time * 1e6: <20} {bandwidth / 1e6: <20}' ) + +class ArrMap(ArrayMap): + def procNum(self, index): + return index[0] % 2 + + def main(args): if len(args) < 7: print("Doesn't have the required input params. Usage:" " " " \n" + "regular)> \n" ) charm.exit(-1) @@ -93,7 +108,8 @@ def main(args): print_format = int(args[5]) use_gpudirect = int(args[6]) - pings = Group(Ping, args=[use_gpudirect, print_format]) + peMap = Group(ArrMap) + pings = Array(Ping, 2, args=[use_gpudirect, print_format], map = peMap) charm.awaitCreation(pings) msg_size = min_msg_size From fa2072596bb3f82ecd2f5913259dacf1c253bcc5 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Mon, 18 Jan 2021 17:23:27 -0500 Subject: [PATCH 022/107] fix datatype passed to CkGetGPUDirectData --- charm4py/charmlib/charmlib_cython.pyx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/charm4py/charmlib/charmlib_cython.pyx b/charm4py/charmlib/charmlib_cython.pyx index f62be726..8801fcc2 100644 --- a/charm4py/charmlib/charmlib_cython.pyx +++ b/charm4py/charmlib/charmlib_cython.pyx @@ -908,10 +908,10 @@ class CharmLib(object): remote_buf_ptrs[idx] = remote_bufs[idx][1] stream_ptrs_forc[idx] = stream_ptrs[idx] # what do we do about the return future? Need to turn it into some callback. 
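      # (How this question gets resolved later in this series: the waiting future's
      # id is handed to the runtime along with the Rget request, and the runtime
      # fires a callback once every buffer has arrived. Rough sketch, using the
      # names these commits introduce:
      #
      #   return_fut = self.Future(len(post_buffers))    # created in charm.getGPUDirectData
      #   self.lib.getGPUDirectData(..., return_fut)     # passes return_fut.fid down to C++
      #   # the C++ side later invokes depositFutureWithId(fid), which calls
      #   # charm._future_deposit_result(fid) and unblocks return_fut.get())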
- CkGetGPUDirectData(num_buffers, recv_buf_ptrs.data.as_voidptr, + CkGetGPUDirectData(num_buffers, recv_buf_ptrs[0], recv_buf_sizes.data.as_voidptr, - remote_buf_ptrs.data.as_voidptr, - stream_ptrs_forc.data.as_voidptr, + remote_buf_ptrs[0], + stream_ptrs_forc[0], future_id ) @@ -978,7 +978,7 @@ cdef void recvGPUDirectMsg(int aid, int ndims, int *arrayIndex, int ep, int numD for idx in range(numDevBuffs): # Add the size of this buffer and a pointer to it to the info list devBufInfo.append((devBufSizes[idx], - (devBufs+(CK_DEVICEBUFFER_SIZE_IN_BYTES*idx))) + devBufs+(CK_DEVICEBUFFER_SIZE_IN_BYTES*idx)) ) recv_buffer.setMsg(msg, msgSize) charm.recvGPUDirectMsg(aid, array_index_to_tuple(ndims, arrayIndex), ep, devBufInfo, recv_buffer, dcopy_start) From a952d25f520e13d1c1d71430ff31d659f3ff7d02 Mon Sep 17 00:00:00 2001 From: Jaemin Choi Date: Mon, 18 Jan 2021 18:35:52 -0500 Subject: [PATCH 023/107] Temporary fix for requiring libmpi.so --- charm4py/charm.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/charm4py/charm.py b/charm4py/charm.py index 6bb3f28a..fea8a48f 100644 --- a/charm4py/charm.py +++ b/charm4py/charm.py @@ -111,11 +111,13 @@ def __init__(self): self.options.interactive.verbose = 1 self.options.interactive.broadcast_imports = True + ''' if 'OMPI_COMM_WORLD_SIZE' in os.environ: # this is needed for OpenMPI, see: # https://svn.open-mpi.org/trac/ompi/wiki/Linkers import ctypes self.__libmpi__ = ctypes.CDLL('libmpi.so', mode=ctypes.RTLD_GLOBAL) + ''' self.lib = load_charm_library(self) self.ReducerType = self.lib.ReducerType self.CkContributeToChare = self.lib.CkContributeToChare From 266b19f9917e42d04db9e45f870bc2b38a1f6b5d Mon Sep 17 00:00:00 2001 From: Jaemin Choi Date: Mon, 18 Jan 2021 19:01:11 -0500 Subject: [PATCH 024/107] Remove break from GPU pingpong benchmark --- tests/benchmark/pingpong_gpu.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/benchmark/pingpong_gpu.py b/tests/benchmark/pingpong_gpu.py index 30310db7..1e7a8eca 100644 --- a/tests/benchmark/pingpong_gpu.py +++ b/tests/benchmark/pingpong_gpu.py @@ -47,7 +47,6 @@ def do_iteration(self, message_size, num_iters, done_future): d_data.copy_to_device(partner_channel.recv()) else: partner_channel.send(d_data) - break # partner_channel.recv(d_data) # sleep because callbacks not implemented yet # charm.sleep(0.15) From 25d3ccedadddde5c9bed46d5b7ed7df769a699ce Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Mon, 18 Jan 2021 17:23:27 -0500 Subject: [PATCH 025/107] fix datatype passed to CkGetGPUDirectData --- charm4py/charmlib/charmlib_cython.pyx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/charm4py/charmlib/charmlib_cython.pyx b/charm4py/charmlib/charmlib_cython.pyx index f62be726..8801fcc2 100644 --- a/charm4py/charmlib/charmlib_cython.pyx +++ b/charm4py/charmlib/charmlib_cython.pyx @@ -908,10 +908,10 @@ class CharmLib(object): remote_buf_ptrs[idx] = remote_bufs[idx][1] stream_ptrs_forc[idx] = stream_ptrs[idx] # what do we do about the return future? Need to turn it into some callback. 
- CkGetGPUDirectData(num_buffers, recv_buf_ptrs.data.as_voidptr, + CkGetGPUDirectData(num_buffers, recv_buf_ptrs[0], recv_buf_sizes.data.as_voidptr, - remote_buf_ptrs.data.as_voidptr, - stream_ptrs_forc.data.as_voidptr, + remote_buf_ptrs[0], + stream_ptrs_forc[0], future_id ) @@ -978,7 +978,7 @@ cdef void recvGPUDirectMsg(int aid, int ndims, int *arrayIndex, int ep, int numD for idx in range(numDevBuffs): # Add the size of this buffer and a pointer to it to the info list devBufInfo.append((devBufSizes[idx], - (devBufs+(CK_DEVICEBUFFER_SIZE_IN_BYTES*idx))) + devBufs+(CK_DEVICEBUFFER_SIZE_IN_BYTES*idx)) ) recv_buffer.setMsg(msg, msgSize) charm.recvGPUDirectMsg(aid, array_index_to_tuple(ndims, arrayIndex), ep, devBufInfo, recv_buffer, dcopy_start) From 663ac5a26000da96e267064e109f9f25067fc17e Mon Sep 17 00:00:00 2001 From: Jaemin Choi Date: Wed, 20 Jan 2021 00:23:47 -0500 Subject: [PATCH 026/107] Debugging, fixed future ID and passing pointers to CkGetGPUDirectData --- charm4py/channel.py | 1 + charm4py/charm.py | 2 ++ charm4py/charmlib/charmlib_cython.pyx | 19 +++++++++++++++++-- tests/benchmark/pingpong_gpu.py | 2 ++ 4 files changed, 22 insertions(+), 2 deletions(-) diff --git a/charm4py/channel.py b/charm4py/channel.py index c7883a7f..08bd1696 100644 --- a/charm4py/channel.py +++ b/charm4py/channel.py @@ -75,6 +75,7 @@ def recv(self, *post_buffers, stream_ptrs = None): ret = ret[:-1:1] assert len(post_buffers) == len(gpu_recv_bufs) recv_future = charm.getGPUDirectData(post_buffers, gpu_recv_bufs, stream_ptrs) + print('recv_future fid', recv_future.fid) recv_future.get() return ret diff --git a/charm4py/charm.py b/charm4py/charm.py index fea8a48f..fc86fef6 100644 --- a/charm4py/charm.py +++ b/charm4py/charm.py @@ -316,6 +316,7 @@ def recvArrayMsg(self, aid, index, ep, msg, dcopy_start): def recvGPUDirectMsg(self, aid, index, ep, devBuf_ptrs, msg, dcopy_start ): + print('recvGPUDirectMsg') obj = self.arrays[aid][index] header, args = self.unpackMsg(msg, dcopy_start, obj) args.append(devBuf_ptrs) @@ -350,6 +351,7 @@ def unpackMsg(self, msg, dcopy_start, dest_obj): return header, args def getGPUDirectData(self, post_buffers, remote_bufs, stream_ptrs): + print('getGPUDirectData') return_fut = self.Future(len(post_buffers)) post_buf_data = [getDeviceDataAddress(buf) for buf in post_buffers] if not stream_ptrs: diff --git a/charm4py/charmlib/charmlib_cython.pyx b/charm4py/charmlib/charmlib_cython.pyx index 8801fcc2..8f2a0a51 100644 --- a/charm4py/charmlib/charmlib_cython.pyx +++ b/charm4py/charmlib/charmlib_cython.pyx @@ -465,6 +465,7 @@ class CharmLib(object): def CkArraySendWithDeviceData(self, int array_id, index not None, int ep, msg not None, list stream_ptrs): + print('CkArraySendWithDeviceData') global gpu_direct_buf_idx cdef int i = 0 cdef int ndims = len(index) @@ -887,8 +888,10 @@ class CharmLib(object): CcdCallFnAfter(CcdCallFnAfterCallback, tag, msecs) def getGPUDirectData(self, list post_buf_data, list remote_bufs, list stream_ptrs, return_fut): + print('charmlib getGPUDirectData') cdef int num_buffers = len(post_buf_data) cdef int *future_id = malloc(sizeof(int)) + future_id[0] = return_fut.fid cdef array.array int_array_template = array.array('i', []) cdef array.array long_array_template = array.array('L', []) cdef array.array recv_buf_sizes @@ -907,13 +910,22 @@ class CharmLib(object): recv_buf_sizes[idx] = remote_bufs[idx][0] remote_buf_ptrs[idx] = remote_bufs[idx][1] stream_ptrs_forc[idx] = stream_ptrs[idx] - # what do we do about the return future? 
Need to turn it into some callback. + print("Op %d: dest ptr: %02x, size: %d, src DeviceBuffer ptr: %02x, stream ptr: %02x\n" %(idx, recv_buf_ptrs[idx], recv_buf_sizes[idx], remote_buf_ptrs[idx], stream_ptrs_forc[idx])); + # what do we do about the return future? Need to turn it into some callback. + ''' CkGetGPUDirectData(num_buffers, recv_buf_ptrs[0], recv_buf_sizes.data.as_voidptr, remote_buf_ptrs[0], stream_ptrs_forc[0], future_id ) + ''' + CkGetGPUDirectData(num_buffers, recv_buf_ptrs.data.as_voidptr, + recv_buf_sizes.data.as_voidptr, + remote_buf_ptrs.data.as_voidptr, + stream_ptrs_forc.data.as_voidptr, + future_id + ) # first callback from Charm++ shared library cdef void registerMainModule(): @@ -1047,7 +1059,10 @@ cdef void depositFutureWithId(void *param, void *msg): # TODO: Figure out how this param value should be allocated/deallocated cdef int futureId = ( param)[0] free(param) - charm._future_deposit_result(futureId) + #charm._future_deposit_result(futureId) + print('depositFutureWithId', futureId) + charm.thisProxy[1]._future_deposit_result(futureId) + cdef void createCallbackMsg(void *data, int dataSize, int reducerType, int fid, int *sectionInfo, char **returnBuffers, int *returnBufferSizes): diff --git a/tests/benchmark/pingpong_gpu.py b/tests/benchmark/pingpong_gpu.py index 1e7a8eca..6ae0caa9 100644 --- a/tests/benchmark/pingpong_gpu.py +++ b/tests/benchmark/pingpong_gpu.py @@ -46,6 +46,7 @@ def do_iteration(self, message_size, num_iters, done_future): partner_channel.send(h_data) d_data.copy_to_device(partner_channel.recv()) else: + print("Sending ", hex(d_data.device_ctypes_pointer.value)) partner_channel.send(d_data) # partner_channel.recv(d_data) # sleep because callbacks not implemented yet @@ -57,6 +58,7 @@ def do_iteration(self, message_size, num_iters, done_future): d_data.copy_to_host(h_data) partner_channel.send(h_data) else: + print("Receiving ", hex(d_data.device_ctypes_pointer.value)) partner_channel.recv(d_data) # d_data.copy_to_host(h_data) # print(h_data[0]) From 9000ec3710984ee04cf60d6686050c66b603a3ab Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 20 Jan 2021 16:18:40 -0500 Subject: [PATCH 027/107] call local charm object, not charm remote when depositing GPU recv future --- charm4py/charm.py | 4 ++++ charm4py/charmlib/charmlib_cython.pyx | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/charm4py/charm.py b/charm4py/charm.py index fc86fef6..82968833 100644 --- a/charm4py/charm.py +++ b/charm4py/charm.py @@ -359,6 +359,10 @@ def getGPUDirectData(self, post_buffers, remote_bufs, stream_ptrs): self.lib.getGPUDirectData(post_buf_data, remote_bufs, stream_ptrs, return_fut) return return_fut + # deposit value of one of the futures that was created on this PE + def _future_deposit_result(self, fid, result=None): + self.threadMgr.depositFuture(fid, result) + def packMsg(self, destObj, msgArgs, header): """Prepares a message for sending, given arguments to an entry method invocation. 
diff --git a/charm4py/charmlib/charmlib_cython.pyx b/charm4py/charmlib/charmlib_cython.pyx index 8f2a0a51..c3912119 100644 --- a/charm4py/charmlib/charmlib_cython.pyx +++ b/charm4py/charmlib/charmlib_cython.pyx @@ -1061,7 +1061,7 @@ cdef void depositFutureWithId(void *param, void *msg): free(param) #charm._future_deposit_result(futureId) print('depositFutureWithId', futureId) - charm.thisProxy[1]._future_deposit_result(futureId) + charm._future_deposit_result(futureId) cdef void createCallbackMsg(void *data, int dataSize, int reducerType, int fid, int *sectionInfo, From baa180b3f9e39148256918bb67f4f2ad01e129ee Mon Sep 17 00:00:00 2001 From: Jaemin Choi Date: Wed, 20 Jan 2021 23:58:19 -0500 Subject: [PATCH 028/107] Remove debugging print statements --- charm4py/channel.py | 1 - charm4py/charm.py | 2 -- charm4py/charmlib/charmlib_cython.pyx | 13 ------------- tests/benchmark/pingpong_gpu.py | 2 -- 4 files changed, 18 deletions(-) diff --git a/charm4py/channel.py b/charm4py/channel.py index 08bd1696..c7883a7f 100644 --- a/charm4py/channel.py +++ b/charm4py/channel.py @@ -75,7 +75,6 @@ def recv(self, *post_buffers, stream_ptrs = None): ret = ret[:-1:1] assert len(post_buffers) == len(gpu_recv_bufs) recv_future = charm.getGPUDirectData(post_buffers, gpu_recv_bufs, stream_ptrs) - print('recv_future fid', recv_future.fid) recv_future.get() return ret diff --git a/charm4py/charm.py b/charm4py/charm.py index 82968833..a37fd7a8 100644 --- a/charm4py/charm.py +++ b/charm4py/charm.py @@ -316,7 +316,6 @@ def recvArrayMsg(self, aid, index, ep, msg, dcopy_start): def recvGPUDirectMsg(self, aid, index, ep, devBuf_ptrs, msg, dcopy_start ): - print('recvGPUDirectMsg') obj = self.arrays[aid][index] header, args = self.unpackMsg(msg, dcopy_start, obj) args.append(devBuf_ptrs) @@ -351,7 +350,6 @@ def unpackMsg(self, msg, dcopy_start, dest_obj): return header, args def getGPUDirectData(self, post_buffers, remote_bufs, stream_ptrs): - print('getGPUDirectData') return_fut = self.Future(len(post_buffers)) post_buf_data = [getDeviceDataAddress(buf) for buf in post_buffers] if not stream_ptrs: diff --git a/charm4py/charmlib/charmlib_cython.pyx b/charm4py/charmlib/charmlib_cython.pyx index c3912119..f20155b1 100644 --- a/charm4py/charmlib/charmlib_cython.pyx +++ b/charm4py/charmlib/charmlib_cython.pyx @@ -465,7 +465,6 @@ class CharmLib(object): def CkArraySendWithDeviceData(self, int array_id, index not None, int ep, msg not None, list stream_ptrs): - print('CkArraySendWithDeviceData') global gpu_direct_buf_idx cdef int i = 0 cdef int ndims = len(index) @@ -888,7 +887,6 @@ class CharmLib(object): CcdCallFnAfter(CcdCallFnAfterCallback, tag, msecs) def getGPUDirectData(self, list post_buf_data, list remote_bufs, list stream_ptrs, return_fut): - print('charmlib getGPUDirectData') cdef int num_buffers = len(post_buf_data) cdef int *future_id = malloc(sizeof(int)) future_id[0] = return_fut.fid @@ -910,16 +908,7 @@ class CharmLib(object): recv_buf_sizes[idx] = remote_bufs[idx][0] remote_buf_ptrs[idx] = remote_bufs[idx][1] stream_ptrs_forc[idx] = stream_ptrs[idx] - print("Op %d: dest ptr: %02x, size: %d, src DeviceBuffer ptr: %02x, stream ptr: %02x\n" %(idx, recv_buf_ptrs[idx], recv_buf_sizes[idx], remote_buf_ptrs[idx], stream_ptrs_forc[idx])); # what do we do about the return future? Need to turn it into some callback. 
- ''' - CkGetGPUDirectData(num_buffers, recv_buf_ptrs[0], - recv_buf_sizes.data.as_voidptr, - remote_buf_ptrs[0], - stream_ptrs_forc[0], - future_id - ) - ''' CkGetGPUDirectData(num_buffers, recv_buf_ptrs.data.as_voidptr, recv_buf_sizes.data.as_voidptr, remote_buf_ptrs.data.as_voidptr, @@ -1059,8 +1048,6 @@ cdef void depositFutureWithId(void *param, void *msg): # TODO: Figure out how this param value should be allocated/deallocated cdef int futureId = ( param)[0] free(param) - #charm._future_deposit_result(futureId) - print('depositFutureWithId', futureId) charm._future_deposit_result(futureId) diff --git a/tests/benchmark/pingpong_gpu.py b/tests/benchmark/pingpong_gpu.py index 6ae0caa9..1e7a8eca 100644 --- a/tests/benchmark/pingpong_gpu.py +++ b/tests/benchmark/pingpong_gpu.py @@ -46,7 +46,6 @@ def do_iteration(self, message_size, num_iters, done_future): partner_channel.send(h_data) d_data.copy_to_device(partner_channel.recv()) else: - print("Sending ", hex(d_data.device_ctypes_pointer.value)) partner_channel.send(d_data) # partner_channel.recv(d_data) # sleep because callbacks not implemented yet @@ -58,7 +57,6 @@ def do_iteration(self, message_size, num_iters, done_future): d_data.copy_to_host(h_data) partner_channel.send(h_data) else: - print("Receiving ", hex(d_data.device_ctypes_pointer.value)) partner_channel.recv(d_data) # d_data.copy_to_host(h_data) # print(h_data[0]) From 1061a1127b59fb6c4da8c133ee19e8c355d1d7d5 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Thu, 21 Jan 2021 10:09:11 -0500 Subject: [PATCH 029/107] put the pong back in the benchmark --- tests/benchmark/pingpong_gpu.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/tests/benchmark/pingpong_gpu.py b/tests/benchmark/pingpong_gpu.py index 1e7a8eca..3e5ce291 100644 --- a/tests/benchmark/pingpong_gpu.py +++ b/tests/benchmark/pingpong_gpu.py @@ -47,9 +47,7 @@ def do_iteration(self, message_size, num_iters, done_future): d_data.copy_to_device(partner_channel.recv()) else: partner_channel.send(d_data) - # partner_channel.recv(d_data) - # sleep because callbacks not implemented yet - # charm.sleep(0.15) + partner_channel.recv(d_data) else: if not self.gpu_direct: @@ -58,9 +56,7 @@ def do_iteration(self, message_size, num_iters, done_future): partner_channel.send(h_data) else: partner_channel.recv(d_data) - # d_data.copy_to_host(h_data) - # print(h_data[0]) - # partner_channel.send(d_data) + partner_channel.send(d_data) tend = time.time() @@ -112,6 +108,12 @@ def main(args): charm.awaitCreation(pings) msg_size = min_msg_size + # do a warmup iteration (should this be done for each size?) 
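    # (Later commits in this series answer this by folding warmup into
    # do_iteration itself, so every message size runs WARMUP_ITERS untimed
    # iterations before the timer starts, roughly:
    #
    #   for iternum in range(num_iters + WARMUP_ITERS):
    #       if iternum == WARMUP_ITERS:
    #           tstart = time.time()
    #
    # and this one-off warmup block goes away again.)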
+ done_future = Future() + pings.do_iteration(msg_size, high_iter, done_future) + done_future.get() + + while msg_size <= max_msg_size: if msg_size <= 1048576: iter = low_iter From a6c3684128e8712784899283eb707ef2c3ea0510 Mon Sep 17 00:00:00 2001 From: Jaemin Choi Date: Thu, 21 Jan 2021 13:07:47 -0500 Subject: [PATCH 030/107] Separate send and recv buffers in GPU pingpong --- tests/benchmark/pingpong_gpu.py | 35 +++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/tests/benchmark/pingpong_gpu.py b/tests/benchmark/pingpong_gpu.py index 3e5ce291..c90b0d0a 100644 --- a/tests/benchmark/pingpong_gpu.py +++ b/tests/benchmark/pingpong_gpu.py @@ -25,14 +25,19 @@ def __init__(self, use_gpudirect, print_format): @coro def do_iteration(self, message_size, num_iters, done_future): if USE_PINNED: - h_data = cuda.pinned_array(message_size, dtype='int8') + h_data_send = cuda.pinned_array(message_size, dtype='int8') + h_data_recv = cuda.pinned_array(message_size, dtype='int8') else: if self.am_low_chare: - h_data = np.ones(message_size, dtype='int8') + h_data_send = np.ones(message_size, dtype='int8') + h_data_recv = np.ones(message_size, dtype='int8') else: - h_data = np.zeros(message_size, dtype='int8') - d_data = cuda.device_array(message_size, dtype='int8') - d_data.copy_to_device(h_data) + h_data_send = np.zeros(message_size, dtype='int8') + h_data_recv = np.zeros(message_size, dtype='int8') + d_data_send = cuda.device_array(message_size, dtype='int8') + d_data_recv = cuda.device_array(message_size, dtype='int8') + d_data_send.copy_to_device(h_data_send) + d_data_recv.copy_to_device(h_data_recv) partner_idx = int(not self.thisIndex[0]) partner = self.thisProxy[partner_idx] partner_channel = Channel(self, partner) @@ -42,21 +47,21 @@ def do_iteration(self, message_size, num_iters, done_future): for _ in range(num_iters): if self.am_low_chare: if not self.gpu_direct: - d_data.copy_to_host(h_data) - partner_channel.send(h_data) - d_data.copy_to_device(partner_channel.recv()) + d_data_send.copy_to_host(h_data_send) + partner_channel.send(h_data_send) + d_data_recv.copy_to_device(partner_channel.recv()) else: - partner_channel.send(d_data) - partner_channel.recv(d_data) + partner_channel.send(d_data_send) + partner_channel.recv(d_data_recv) else: if not self.gpu_direct: - d_data.copy_to_device(partner_channel.recv()) - d_data.copy_to_host(h_data) - partner_channel.send(h_data) + d_data_recv.copy_to_device(partner_channel.recv()) + d_data_send.copy_to_host(h_data_send) + partner_channel.send(h_data_send) else: - partner_channel.recv(d_data) - partner_channel.send(d_data) + partner_channel.recv(d_data_recv) + partner_channel.send(d_data_send) tend = time.time() From 3d2fd1544b30bf73fd6fa17d5bb10ff42edb3df5 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Sat, 23 Jan 2021 11:24:19 -0500 Subject: [PATCH 031/107] use pinned memory by default, add warmup iterations --- tests/benchmark/pingpong_gpu.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/tests/benchmark/pingpong_gpu.py b/tests/benchmark/pingpong_gpu.py index c90b0d0a..27ce0a4f 100644 --- a/tests/benchmark/pingpong_gpu.py +++ b/tests/benchmark/pingpong_gpu.py @@ -3,7 +3,8 @@ import numpy as np from numba import cuda -USE_PINNED = False +USE_PINNED = True +WARMUP_ITERS = 10 class Ping(Chare): def __init__(self, use_gpudirect, print_format): @@ -34,6 +35,7 @@ def do_iteration(self, message_size, num_iters, done_future): else: h_data_send = np.zeros(message_size, 
dtype='int8') h_data_recv = np.zeros(message_size, dtype='int8') + d_data_send = cuda.device_array(message_size, dtype='int8') d_data_recv = cuda.device_array(message_size, dtype='int8') d_data_send.copy_to_device(h_data_send) @@ -42,6 +44,26 @@ def do_iteration(self, message_size, num_iters, done_future): partner = self.thisProxy[partner_idx] partner_channel = Channel(self, partner) + for _ in range(WARMUP_ITERS): + if self.am_low_chare: + if not self.gpu_direct: + d_data_send.copy_to_host(h_data_send) + partner_channel.send(h_data_send) + d_data_recv.copy_to_device(partner_channel.recv()) + else: + partner_channel.send(d_data_send) + partner_channel.recv(d_data_recv) + + else: + if not self.gpu_direct: + d_data_recv.copy_to_device(partner_channel.recv()) + d_data_send.copy_to_host(h_data_send) + partner_channel.send(h_data_send) + else: + partner_channel.recv(d_data_recv) + partner_channel.send(d_data_send) + + tstart = time.time() for _ in range(num_iters): From 36777386f91b3eac13d7bfe6d104f9f496a2f55b Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Sat, 23 Jan 2021 11:26:56 -0500 Subject: [PATCH 032/107] remove warmup for only the first size --- tests/benchmark/pingpong_gpu.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tests/benchmark/pingpong_gpu.py b/tests/benchmark/pingpong_gpu.py index 27ce0a4f..e4474e88 100644 --- a/tests/benchmark/pingpong_gpu.py +++ b/tests/benchmark/pingpong_gpu.py @@ -135,12 +135,6 @@ def main(args): charm.awaitCreation(pings) msg_size = min_msg_size - # do a warmup iteration (should this be done for each size?) - done_future = Future() - pings.do_iteration(msg_size, high_iter, done_future) - done_future.get() - - while msg_size <= max_msg_size: if msg_size <= 1048576: iter = low_iter From aac9651991efa5ea5b8614f78814ec1d9d1d8b15 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Sat, 23 Jan 2021 18:24:41 -0500 Subject: [PATCH 033/107] send, recv can now have the pointers set directly --- charm4py/channel.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/charm4py/channel.py b/charm4py/channel.py index c7883a7f..e187f19f 100644 --- a/charm4py/channel.py +++ b/charm4py/channel.py @@ -1,5 +1,6 @@ from .threads import LocalFuture from .charm import charm +import time class Channel(object): @@ -53,15 +54,15 @@ def ready(self): def waitReady(self, f): self.wait_ready = f - def send(self, *msg): + def send(self, *msg, **kwargs): if not self.established: self.established_fut = LocalFuture() self.established_fut.get() self.setEstablished() - self.remote._channelRecv__(self.remote_port, self.send_seqno, *msg) + self.remote._channelRecv__(self.remote_port, self.send_seqno, *msg, **kwargs) self.send_seqno = (self.send_seqno + 1) % CHAN_BUF_SIZE - def recv(self, *post_buffers, stream_ptrs = None): + def recv(self, *post_buffers, post_buf_addresses = None, post_buf_sizes = None, stream_ptrs = None): if self.recv_seqno in self.data: ret = self.data.pop(self.recv_seqno) else: @@ -72,10 +73,19 @@ def recv(self, *post_buffers, stream_ptrs = None): if post_buffers: gpu_recv_bufs = ret[-1] - ret = ret[:-1:1] + # ret = ret[:-1:1] assert len(post_buffers) == len(gpu_recv_bufs) recv_future = charm.getGPUDirectData(post_buffers, gpu_recv_bufs, stream_ptrs) recv_future.get() + elif post_buf_addresses != None: + gpu_recv_bufs = ret + # ret = ret[:-1:1] + assert len(post_buffers) == len(gpu_recv_bufs) + assert post_buf_sizes + recv_future = charm.getGPUDirectDataFromAddresses(post_buf_addresses, post_buf_sizes, gpu_recv_bufs, stream_ptrs) + 
recv_future.get() + + return ret From 8e1c02fd7d87f8853d3e5a8622b98ccab1e7d9d9 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Sat, 23 Jan 2021 18:25:06 -0500 Subject: [PATCH 034/107] add calls to CkArraySendWithDeviceDataFromPointers when applicable --- charm4py/chare.py | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/charm4py/chare.py b/charm4py/chare.py index 58ab2c06..35d53bd8 100644 --- a/charm4py/chare.py +++ b/charm4py/chare.py @@ -722,7 +722,7 @@ def proxy_entry_method(proxy, *args, **kwargs): argname = argnames[i] # first look for argument in kwargs # TODO: Should stream_ptrs be skipped? - if argname == 'stream_ptrs': + if argname in {'stream_ptrs', 'gpu_src_ptrs', 'gpu_src_sizes'}: continue if argname in kwargs and argname: args.append(kwargs[argname]) @@ -751,15 +751,27 @@ def proxy_entry_method(proxy, *args, **kwargs): array = charm.arrays[aid] if elemIdx in array: destObj = array[elemIdx] - msg = charm.packMsg(destObj, args, header) - if msg[1]: - if 'stream_ptrs' in kwargs and kwargs['stream_ptrs']: + should_pack_gpu = True + if 'gpu_src_ptrs' in kwargs: + should_pack_gpu = False + msg = charm.packMsg(destObj, args, header, pack_gpu = should_pack_gpu) + if msg[1] or not should_pack_gpu: + if 'stream_ptrs' in kwargs: stream_ptrs = kwargs['stream_ptrs'] else: stream_ptrs = None - charm.CkArraySendWithDeviceData(aid, elemIdx, ep, - msg, stream_ptrs - ) + if should_pack_gpu: + charm.CkArraySendWithDeviceData(aid, elemIdx, ep, + msg, stream_ptrs + ) + else: + charm.CkArraySendWithDeviceDataFromPointers(aid, elemIdx, ep, + msg, kwargs['gpu_src_ptrs'], + kwargs['gpu_src_sizes'], + stream_ptrs, + len(kwargs['gpu_src_ptrs']) + ) + else: charm.CkArraySend(aid, elemIdx, ep, msg) else: From 71004957dc47eee95b98546ebd8fe2fb48eb6ee2 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Sat, 23 Jan 2021 18:25:48 -0500 Subject: [PATCH 035/107] update DirectCopy API so buffer info is not always gathered --- charm4py/charm.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/charm4py/charm.py b/charm4py/charm.py index a37fd7a8..f1e87523 100644 --- a/charm4py/charm.py +++ b/charm4py/charm.py @@ -28,6 +28,7 @@ from . import reduction from . 
import wait import array +import numpy as np try: import numpy except ImportError: @@ -46,6 +47,9 @@ def getDeviceDataInfo(devArray): def getDeviceDataAddress(devArray): return getDeviceDataInfo(devArray)[0] +def getDeviceDataSizeInBytes(devArray): + return devArray.nbytes + class Options(object): def __str__(self): @@ -128,6 +132,7 @@ def __init__(self): self.CkGroupSend = self.lib.CkGroupSend self.CkArraySend = self.lib.CkArraySend self.CkArraySendWithDeviceData = self.lib.CkArraySendWithDeviceData + self.CkArraySendWithDeviceDataFromPointers = self.lib.CkArraySendWithDeviceDataFromPointers self.reducers = reduction.ReducerContainer(self) self.redMgr = reduction.ReductionManager(self, self.reducers) self.mainchareRegistered = False @@ -319,6 +324,7 @@ def recvGPUDirectMsg(self, aid, index, ep, obj = self.arrays[aid][index] header, args = self.unpackMsg(msg, dcopy_start, obj) args.append(devBuf_ptrs) + self.invokeEntryMethod(obj, ep, header, args) def recvArrayBcast(self, aid, indexes, ep, msg, dcopy_start): @@ -352,9 +358,17 @@ def unpackMsg(self, msg, dcopy_start, dest_obj): def getGPUDirectData(self, post_buffers, remote_bufs, stream_ptrs): return_fut = self.Future(len(post_buffers)) post_buf_data = [getDeviceDataAddress(buf) for buf in post_buffers] + post_buf_sizes = [getDeviceDataSizeInBytes(buf) for buf in post_buffers] if not stream_ptrs: stream_ptrs = [0] * len(post_buffers) - self.lib.getGPUDirectData(post_buf_data, remote_bufs, stream_ptrs, return_fut) + self.lib.getGPUDirectData(post_buf_data, post_buf_sizes, remote_bufs, stream_ptrs, return_fut) + return return_fut + + def getGPUDirectDataFromAddresses(self, post_buf_ptrs, post_buf_sizes, remote_bufs, stream_ptrs): + return_fut = self.Future(len(post_buf_ptrs)) + if not stream_ptrs: + stream_ptrs = array.array('L', [0] * len(post_buf_ptrs)) + self.lib.getGPUDirectDataFromAddresses(post_buf_ptrs, post_buf_sizes, remote_bufs, stream_ptrs, return_fut) return return_fut # deposit value of one of the futures that was created on this PE From 61603ad0b5e7867746eb866c86f2446772251ed7 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Sat, 23 Jan 2021 18:31:17 -0500 Subject: [PATCH 036/107] update CkGetGPUDirectData to use future value --- charm4py/charmlib/ccharm.pxd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charm4py/charmlib/ccharm.pxd b/charm4py/charmlib/ccharm.pxd index 7f2a34b2..ce06826d 100644 --- a/charm4py/charmlib/ccharm.pxd +++ b/charm4py/charmlib/ccharm.pxd @@ -83,7 +83,7 @@ cdef extern from "charm.h": void registerArrayMsgGPUDirectRecvExtCallback(void (*cb)(int, int, int*, int, int, long*, void *, int, char*, int)); void CkGetGPUDirectData(int numBuffers, void *recvBufPtrs, int *arrSizes, - void *remoteBufInfo, void *streamPtrs, int *futureId); + void *remoteBufInfo, void *streamPtrs, int futureId); int CkDeviceBufferSizeInBytes(); From 30a5340385f8830db06a197f074c5546da42e974 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Sat, 23 Jan 2021 18:31:46 -0500 Subject: [PATCH 037/107] GPUDirect buffers can now come from device pointers --- charm4py/charmlib/charmlib_cython.pyx | 82 ++++++++++++++++++++------- 1 file changed, 61 insertions(+), 21 deletions(-) diff --git a/charm4py/charmlib/charmlib_cython.pyx b/charm4py/charmlib/charmlib_cython.pyx index f20155b1..2eb7cde4 100644 --- a/charm4py/charmlib/charmlib_cython.pyx +++ b/charm4py/charmlib/charmlib_cython.pyx @@ -492,6 +492,34 @@ class CharmLib(object): ) gpu_direct_buf_idx = 0 + def CkArraySendWithDeviceDataFromPointers(self, int array_id, index not 
None, int ep, + msg not None, array.array gpu_src_ptrs, + array.array gpu_src_sizes, + list stream_ptrs, int num_bufs): + + cdef int i = 0 + cdef int ndims = len(index) + # assert ndims == 1 + for i in range(ndims): c_index[i] = index[i] + msg0, dcopy = msg + dcopy = None + + if stream_ptrs: + for i in range(num_bufs): + gpu_direct_stream_ptrs[i] = stream_ptrs[i] + else: + memset(gpu_direct_stream_ptrs, 0, sizeof(long) * num_bufs) + + CkChareExtSendWithDeviceData(array_id, c_index, ndims, ep, 1, msg0, len(msg0), + gpu_src_ptrs.data.as_voidptr, + gpu_src_sizes.data.as_voidptr, + gpu_direct_stream_ptrs, + num_bufs + ) + gpu_direct_buf_idx = 0 + + + def CkArraySend(self, int array_id, index not None, int ep, msg not None): global cur_buf msg0, dcopy = msg @@ -816,7 +844,7 @@ class CharmLib(object): return header, args - def packMsg(self, destObj, msgArgs not None, dict header): + def packMsg(self, destObj, msgArgs not None, dict header, pack_gpu=True): cdef int i = 0 cdef int localTag cdef array.array a @@ -841,13 +869,15 @@ class CharmLib(object): for i in range(len(args)): arg = msgArgs[i] if CkCudaEnabled() and hasattr(arg, '__cuda_array_interface__'): - # we want to take the args that implement the cuda array interface and make them into ckdevicebuffers - # assumption: we can get nbytes from the arg directly - # TODO: verify this assertion for other types - gpu_direct_device_ptrs[gpu_direct_buf_idx] = arg.__cuda_array_interface__['data'][0] - gpu_direct_buff_sizes[gpu_direct_buf_idx] = arg.nbytes - cuda_dev_info = True - gpu_direct_buf_idx += 1 + if pack_gpu: + # we want to take the args that implement the cuda array interface and make them into ckdevicebuffers + # assumption: we can get nbytes from the arg directly + # TODO: verify this assertion for other types + # gpu_direct_device_ptrs[gpu_direct_buf_idx] = arg.__cuda_array_interface__['data'][0] + gpu_direct_device_ptrs[gpu_direct_buf_idx] = arg.__cuda_array_interface__['data'][0] + gpu_direct_buff_sizes[gpu_direct_buf_idx] = arg.nbytes + cuda_dev_info = True + gpu_direct_buf_idx += 1 args[i] = None # TODO: should this be done? continue elif isinstance(arg, np.ndarray) and not arg.dtype.hasobject: @@ -886,10 +916,10 @@ class CharmLib(object): def scheduleTagAfter(self, int tag, double msecs): CcdCallFnAfter(CcdCallFnAfterCallback, tag, msecs) - def getGPUDirectData(self, list post_buf_data, list remote_bufs, list stream_ptrs, return_fut): + + def getGPUDirectData(self, list post_buf_data, list post_buf_sizes, array.array remote_bufs, list stream_ptrs, return_fut): cdef int num_buffers = len(post_buf_data) - cdef int *future_id = malloc(sizeof(int)) - future_id[0] = return_fut.fid + cdef int future_id = return_fut.fid cdef array.array int_array_template = array.array('i', []) cdef array.array long_array_template = array.array('L', []) cdef array.array recv_buf_sizes @@ -905,8 +935,8 @@ class CharmLib(object): for idx in range(num_buffers): recv_buf_ptrs[idx] = post_buf_data[idx] - recv_buf_sizes[idx] = remote_bufs[idx][0] - remote_buf_ptrs[idx] = remote_bufs[idx][1] + recv_buf_sizes[idx] = post_buf_sizes[idx] + remote_buf_ptrs[idx] = remote_bufs[idx] stream_ptrs_forc[idx] = stream_ptrs[idx] # what do we do about the return future? Need to turn it into some callback. 
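      # (At this point in the series the future is already handled: its id is
      # passed to CkGetGPUDirectData below and deposited from depositFutureWithId
      # once the Rgets complete. Callers can also bypass Python-side packing by
      # passing raw device pointers, as the pingpong benchmark does, e.g.:
      #
      #   d_data_send_addr = array.array('L', [d_data_send.__cuda_array_interface__['data'][0]])
      #   d_data_send_size = array.array('L', [d_data_send.nbytes])
      #   partner_channel.send(gpu_src_ptrs=d_data_send_addr, gpu_src_sizes=d_data_send_size)
      #   partner_channel.recv(post_buf_addresses=d_data_recv_addr, post_buf_sizes=d_data_recv_size))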
CkGetGPUDirectData(num_buffers, recv_buf_ptrs.data.as_voidptr, @@ -916,6 +946,18 @@ class CharmLib(object): future_id ) + def getGPUDirectDataFromAddresses(self, array.array post_buf_ptrs, array.array post_buf_sizes, array.array remote_bufs, array.array stream_ptrs, return_fut): + cdef int num_buffers = len(post_buf_ptrs) + cdef int future_id = return_fut.fid + # pointers from the remote that we will be issuing Rgets for + # these are pointers to type CkDeviceBuffer + CkGetGPUDirectData(num_buffers, post_buf_ptrs.data.as_voidptr, + post_buf_sizes.data.as_voidptr, + remote_bufs.data.as_voidptr, + stream_ptrs.data.as_voidptr, + future_id + ) + # first callback from Charm++ shared library cdef void registerMainModule(): try: @@ -970,19 +1012,19 @@ cdef void recvArrayMsg(int aid, int ndims, int *arrayIndex, int ep, int msgSize, cdef void recvGPUDirectMsg(int aid, int ndims, int *arrayIndex, int ep, int numDevBuffs, long *devBufSizes, void *devBufs, int msgSize, char *msg, int dcopy_start): + cdef int idx = 0 try: if PROFILING: charm._precvtime = time.time() charm.recordReceive(msgSize) - devBufInfo = [] + devBufInfo = array.array('L', [0] * numDevBuffs) for idx in range(numDevBuffs): - # Add the size of this buffer and a pointer to it to the info list - devBufInfo.append((devBufSizes[idx], - devBufs+(CK_DEVICEBUFFER_SIZE_IN_BYTES*idx)) - ) + # Add the buffer's address to the list + devBufInfo[idx] = devBufs+(CK_DEVICEBUFFER_SIZE_IN_BYTES*idx) recv_buffer.setMsg(msg, msgSize) charm.recvGPUDirectMsg(aid, array_index_to_tuple(ndims, arrayIndex), ep, devBufInfo, recv_buffer, dcopy_start) + except: charm.handleGeneralError() @@ -1045,9 +1087,7 @@ cdef void resumeFromSync(int aid, int ndims, int *arrayIndex): charm.handleGeneralError() cdef void depositFutureWithId(void *param, void *msg): -# TODO: Figure out how this param value should be allocated/deallocated - cdef int futureId = ( param)[0] - free(param) + cdef int futureId = param charm._future_deposit_result(futureId) From 882eb986fe045db15141599cda911a77a9063bc2 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Sat, 23 Jan 2021 18:32:02 -0500 Subject: [PATCH 038/107] add benchmark with new address optimization --- tests/benchmark/pingpong_gpu.py | 54 +++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 23 deletions(-) diff --git a/tests/benchmark/pingpong_gpu.py b/tests/benchmark/pingpong_gpu.py index e4474e88..8c1e9e1a 100644 --- a/tests/benchmark/pingpong_gpu.py +++ b/tests/benchmark/pingpong_gpu.py @@ -2,8 +2,11 @@ import time import numpy as np from numba import cuda +import array USE_PINNED = True +# provide the address/size data for GPU-direct addresses. 
Saves ~11us per iteration +USE_ADDRESS_OPTIMIZATION = True WARMUP_ITERS = 10 class Ping(Chare): @@ -44,46 +47,51 @@ def do_iteration(self, message_size, num_iters, done_future): partner = self.thisProxy[partner_idx] partner_channel = Channel(self, partner) - for _ in range(WARMUP_ITERS): - if self.am_low_chare: - if not self.gpu_direct: - d_data_send.copy_to_host(h_data_send) - partner_channel.send(h_data_send) - d_data_recv.copy_to_device(partner_channel.recv()) - else: - partner_channel.send(d_data_send) - partner_channel.recv(d_data_recv) + if USE_ADDRESS_OPTIMIZATION: + d_data_recv_addr = array.array('L', [0]) + d_data_recv_size = array.array('L', [0]) + d_data_send_addr = array.array('L', [0]) + d_data_send_size = array.array('L', [0]) - else: - if not self.gpu_direct: - d_data_recv.copy_to_device(partner_channel.recv()) - d_data_send.copy_to_host(h_data_send) - partner_channel.send(h_data_send) - else: - partner_channel.recv(d_data_recv) - partner_channel.send(d_data_send) + d_data_recv_addr[0] = d_data_recv.__cuda_array_interface__['data'][0] + d_data_recv_size[0] = d_data_recv.nbytes + d_data_send_addr[0] = d_data_send.__cuda_array_interface__['data'][0] + d_data_send_size[0] = d_data_send.nbytes tstart = time.time() - for _ in range(num_iters): + for iternum in range(num_iters + WARMUP_ITERS): + if iternum == WARMUP_ITERS: + tstart = time.time() if self.am_low_chare: if not self.gpu_direct: d_data_send.copy_to_host(h_data_send) partner_channel.send(h_data_send) d_data_recv.copy_to_device(partner_channel.recv()) else: - partner_channel.send(d_data_send) - partner_channel.recv(d_data_recv) - + if USE_ADDRESS_OPTIMIZATION: + partner_channel.send(gpu_src_ptrs = d_data_send_addr, gpu_src_sizes = d_data_send_size) + partner_channel.recv(post_buf_addresses = d_data_recv_addr, + post_buf_sizes = d_data_recv_size + ) + else: + partner_channel.send(d_data_send) + partner_channel.recv(d_data_recv) else: if not self.gpu_direct: d_data_recv.copy_to_device(partner_channel.recv()) d_data_send.copy_to_host(h_data_send) partner_channel.send(h_data_send) else: - partner_channel.recv(d_data_recv) - partner_channel.send(d_data_send) + if USE_ADDRESS_OPTIMIZATION: + partner_channel.recv(post_buf_addresses = d_data_recv_addr, + post_buf_sizes = d_data_recv_size + ) + partner_channel.send(gpu_src_ptrs = d_data_send_addr, gpu_src_sizes = d_data_send_size) + else: + partner_channel.recv(d_data_recv) + partner_channel.send(d_data_send) tend = time.time() From 4be40c94236cb384c953b919353371f5a02aa4dc Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Sun, 24 Jan 2021 14:34:00 -0500 Subject: [PATCH 039/107] update low/high iter breakpoint threshold according to experimental methodology --- tests/benchmark/pingpong_gpu.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/benchmark/pingpong_gpu.py b/tests/benchmark/pingpong_gpu.py index 8c1e9e1a..8a65dcca 100644 --- a/tests/benchmark/pingpong_gpu.py +++ b/tests/benchmark/pingpong_gpu.py @@ -7,6 +7,7 @@ USE_PINNED = True # provide the address/size data for GPU-direct addresses. 
Saves ~11us per iteration USE_ADDRESS_OPTIMIZATION = True +LOW_ITER_THRESHOLD = 8192 WARMUP_ITERS = 10 class Ping(Chare): @@ -144,7 +145,7 @@ def main(args): msg_size = min_msg_size while msg_size <= max_msg_size: - if msg_size <= 1048576: + if msg_size <= LOW_ITER_THRESHOLD: iter = low_iter else: iter = high_iter From 8ef56e5ec14a9a29279bea9b1cff290d761de499 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Mon, 25 Jan 2021 13:37:31 -0500 Subject: [PATCH 040/107] add hooks to cuda copy functions --- charm4py/charmlib/ccharm.pxd | 10 ++++++---- charm4py/charmlib/charmlib_cython.pyx | 12 +++++++++++- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/charm4py/charmlib/ccharm.pxd b/charm4py/charmlib/ccharm.pxd index ce06826d..e11c204f 100644 --- a/charm4py/charmlib/ccharm.pxd +++ b/charm4py/charmlib/ccharm.pxd @@ -1,5 +1,7 @@ # libcharm wrapper for Cython +cdef extern from "cuda_runtime.h": + ctypedef long cudaStream_t cdef extern from "charm.h": @@ -89,6 +91,10 @@ cdef extern from "charm.h": void registerDepositFutureWithIdFn(void (*cb)(void*, void*)); + void CkCUDAHtoD(void *dest, void *src, int nbytes, cudaStream_t stream); + void CkCUDADtoH(void *dest, void *src, int nbytes, cudaStream_t stream); + void CkCUDAStreamSynchronize(cudaStream_t stream); + cdef extern from "spanningTree.h": @@ -96,7 +102,3 @@ cdef extern from "spanningTree.h": int *parent, int *child_count, int **children); -# cdef extern from "ckrdmadevice.h": -# cdef cppclass CkDeviceBuffer: -# CkDeviceBuffer() - diff --git a/charm4py/charmlib/charmlib_cython.pyx b/charm4py/charmlib/charmlib_cython.pyx index 2eb7cde4..c360c264 100644 --- a/charm4py/charmlib/charmlib_cython.pyx +++ b/charm4py/charmlib/charmlib_cython.pyx @@ -957,6 +957,17 @@ class CharmLib(object): stream_ptrs.data.as_voidptr, future_id ) + def CudaHtoD(self, long destAddr, long srcAddr, int nbytes, long streamAddr): + CkCUDAHtoD(destAddr, srcAddr,nbytes, ( streamAddr)[0]); + + def CudaDtoH(self, long destAddr, long srcAddr, int nbytes, long streamAddr): + CkCUDADtoH(destAddr, srcAddr,int(nbytes), ( streamAddr)[0]); + + def CudaStreamSynchronize(self, long streamAddr): + CkCUDAStreamSynchronize((streamAddr)[0]) + + + # first callback from Charm++ shared library cdef void registerMainModule(): @@ -1028,7 +1039,6 @@ cdef void recvGPUDirectMsg(int aid, int ndims, int *arrayIndex, int ep, int numD except: charm.handleGeneralError() - cdef void recvArrayBcast(int aid, int ndims, int nInts, int numElems, int *arrayIndexes, int ep, int msgSize, char *msg, int dcopy_start): cdef int i = 0 try: From f058ab31f084067b34a163eb525b965bfcc5baef Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Mon, 25 Jan 2021 13:42:37 -0500 Subject: [PATCH 041/107] don't use slow Numba transfer functionality when using host-staging --- tests/benchmark/pingpong_gpu.py | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/tests/benchmark/pingpong_gpu.py b/tests/benchmark/pingpong_gpu.py index 8a65dcca..5bfafd6e 100644 --- a/tests/benchmark/pingpong_gpu.py +++ b/tests/benchmark/pingpong_gpu.py @@ -48,7 +48,15 @@ def do_iteration(self, message_size, num_iters, done_future): partner = self.thisProxy[partner_idx] partner_channel = Channel(self, partner) - if USE_ADDRESS_OPTIMIZATION: + my_stream = cuda.stream() + stream_address = my_stream.handle.value + d_data_send_addr = d_data_send.__cuda_array_interface__['data'][0] + h_data_send_addr = h_data_send.__array_interface__['data'][0] + + d_data_recv_addr = 
d_data_recv.__cuda_array_interface__['data'][0] + h_data_recv_addr = h_data_recv.__array_interface__['data'][0] + + if self.gpu_direct and USE_ADDRESS_OPTIMIZATION: d_data_recv_addr = array.array('L', [0]) d_data_recv_size = array.array('L', [0]) d_data_send_addr = array.array('L', [0]) @@ -67,9 +75,14 @@ def do_iteration(self, message_size, num_iters, done_future): tstart = time.time() if self.am_low_chare: if not self.gpu_direct: - d_data_send.copy_to_host(h_data_send) + charm.lib.CudaDtoH(h_data_send_addr, d_data_send_addr, message_size, stream_address) + charm.lib.CudaStreamSynchronize(stream_address) + partner_channel.send(h_data_send) - d_data_recv.copy_to_device(partner_channel.recv()) + received = partner_channel.recv() + + charm.lib.CudaHtoD(d_data_recv_addr, received.__array_interface__['data'][0], message_size, stream_address) + charm.lib.CudaStreamSynchronize(stream_address) else: if USE_ADDRESS_OPTIMIZATION: partner_channel.send(gpu_src_ptrs = d_data_send_addr, gpu_src_sizes = d_data_send_size) @@ -81,8 +94,12 @@ def do_iteration(self, message_size, num_iters, done_future): partner_channel.recv(d_data_recv) else: if not self.gpu_direct: - d_data_recv.copy_to_device(partner_channel.recv()) - d_data_send.copy_to_host(h_data_send) + received = partner_channel.recv() + + charm.lib.CudaHtoD(d_data_recv_addr, received.__array_interface__['data'][0], message_size, stream_address) + charm.lib.CudaDtoH(h_data_send_addr, d_data_send_addr, message_size, stream_address) + charm.lib.CudaStreamSynchronize(stream_address) + partner_channel.send(h_data_send) else: if USE_ADDRESS_OPTIMIZATION: From 6cad70bd885b1c97f276f111a9e0d39c62434b00 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Mon, 25 Jan 2021 15:24:17 -0500 Subject: [PATCH 042/107] host-staging bandwidth test --- tests/benchmark/bandwidth.py | 132 +++++++++++++++++++++++++++++++++++ 1 file changed, 132 insertions(+) create mode 100644 tests/benchmark/bandwidth.py diff --git a/tests/benchmark/bandwidth.py b/tests/benchmark/bandwidth.py new file mode 100644 index 00000000..a576d6a2 --- /dev/null +++ b/tests/benchmark/bandwidth.py @@ -0,0 +1,132 @@ +from charm4py import charm, Chare, Array, coro, Future, Channel, Group, ArrayMap +import time +import numpy as np +from numba import cuda +import array + +USE_PINNED = True +# provide the address/size data for GPU-direct addresses. 
Saves ~11us per iteration +USE_ADDRESS_OPTIMIZATION = True +LOW_ITER_THRESHOLD = 8192 +WARMUP_ITERS = 10 + + +class Block(Chare): + def __init__(self, use_gpudirect): + self.gpu_direct = use_gpudirect + self.num_chares = charm.numPes() + self.am_low_chare = self.thisIndex[0] == 0 + + if self.am_low_chare: + print("Msg Size, Iterations, Bandwidth (MB/s)") + + @coro + def do_iteration(self, message_size, windows, num_iters, done_future): + if USE_PINNED: + h_local_data = cuda.pinned_array(message_size, dtype='int8') + h_remote_data = cuda.pinned_array(message_size, dtype='int8') + else: + if self.am_low_chare: + h_local_data = np.ones(message_size, dtype='int8') + h_remote_data = np.ones(message_size, dtype='int8') + else: + h_local_data = np.zeros(message_size, dtype='int8') + h_remote_data = np.zeros(message_size, dtype='int8') + + + d_local_data = cuda.device_array(message_size, dtype='int8') + d_remote_data = cuda.device_array(message_size, dtype='int8') + + my_stream = cuda.stream() + stream_address = my_stream.handle.value + + d_local_data_addr = d_local_data.__cuda_array_interface__['data'][0] + h_local_data_addr = h_local_data.__array_interface__['data'][0] + + d_remote_data_addr = d_remote_data.__cuda_array_interface__['data'][0] + h_remote_data_addr = h_remote_data.__array_interface__['data'][0] + + partner_idx = int(not self.thisIndex[0]) + partner = self.thisProxy[partner_idx] + partner_channel = Channel(self, partner) + + tstart = 0 + + for idx in range(num_iters + WARMUP_ITERS): + if idx == WARMUP_ITERS: + tstart = time.time() + if self.am_low_chare: + if not self.gpu_direct: + for _ in range(windows): + charm.lib.CudaDtoH(h_local_data_addr, d_local_data_addr, message_size, stream_address) + charm.lib.CudaStreamSynchronize(stream_address) + for _ in range(windows): + partner_channel.send(h_local_data) + partner_channel.recv() + else: + pass + else: + if not self.gpu_direct: + for _ in range(windows): + received = partner_channel.recv() + charm.lib.CudaHtoD(d_remote_data_addr, received.__array_interface__['data'][0], + message_size, stream_address + ) + charm.lib.CudaStreamSynchronize(stream_address) + partner_channel.send(1) + else: + pass + + tend = time.time() + elapsed_time = tend - tstart + if self.am_low_chare: + self.display_iteration_data(elapsed_time, num_iters, windows, message_size) + + self.reduce(done_future) + + def display_iteration_data(self, elapsed_time, num_iters, windows, message_size): + data_sent = message_size / 1e6 * num_iters * windows; + print(f'{message_size},{num_iters},{data_sent/elapsed_time}') + + + +class ArrMap(ArrayMap): + def procNum(self, index): + return index[0] % 2 + + +def main(args): + if len(args) < 7: + print("Doesn't have the required input params. 
Usage:" + " " + " " + "\n" + ) + charm.exit(-1) + + min_msg_size = int(args[1]) + max_msg_size = int(args[2]) + window_size = int(args[3]) + low_iter = int(args[4]) + high_iter = int(args[5]) + use_gpudirect = int(args[6]) + + peMap = Group(ArrMap) + blocks = Array(Block, 2, args=[use_gpudirect], map = peMap) + charm.awaitCreation(blocks) + msg_size = min_msg_size + + while msg_size <= max_msg_size: + if msg_size <= LOW_ITER_THRESHOLD: + iter = low_iter + else: + iter = high_iter + done_future = Future() + blocks.do_iteration(msg_size, window_size, iter, done_future) + done_future.get() + msg_size *= 2 + + charm.exit() + + +charm.start(main) From 9e0ff3121a91b3dc884d7c99c89d3b1bdadd9574 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Mon, 25 Jan 2021 15:49:12 -0500 Subject: [PATCH 043/107] add gpudirect bw test --- tests/benchmark/bandwidth.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/tests/benchmark/bandwidth.py b/tests/benchmark/bandwidth.py index a576d6a2..318fe849 100644 --- a/tests/benchmark/bandwidth.py +++ b/tests/benchmark/bandwidth.py @@ -46,6 +46,14 @@ def do_iteration(self, message_size, windows, num_iters, done_future): d_remote_data_addr = d_remote_data.__cuda_array_interface__['data'][0] h_remote_data_addr = h_remote_data.__array_interface__['data'][0] + if self.gpu_direct: + d_local_data_addr = array.array('L', [0]) + d_local_data_size = array.array('L', [0]) + + d_local_data_addr[0] = d_local_data.__cuda_array_interface__['data'][0] + d_local_data_size[0] = d_local_data.nbytes + + partner_idx = int(not self.thisIndex[0]) partner = self.thisProxy[partner_idx] partner_channel = Channel(self, partner) @@ -64,7 +72,11 @@ def do_iteration(self, message_size, windows, num_iters, done_future): partner_channel.send(h_local_data) partner_channel.recv() else: - pass + for _ in range(windows): + partner_channel.send(gpu_src_ptrs = d_local_data_addr, + gpu_src_sizes = d_local_data_size + ) + partner_channel.recv() else: if not self.gpu_direct: for _ in range(windows): @@ -73,9 +85,11 @@ def do_iteration(self, message_size, windows, num_iters, done_future): message_size, stream_address ) charm.lib.CudaStreamSynchronize(stream_address) - partner_channel.send(1) else: - pass + for _ in range(windows): + partner_channel.recv(post_buf_addresses = d_local_data_addr, + post_buf_sizes = d_local_data_size) + partner_channel.send(1) tend = time.time() elapsed_time = tend - tstart From 1752446db21ccb4461e36e58f6c03c3b0546e005 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Mon, 25 Jan 2021 19:44:08 -0500 Subject: [PATCH 044/107] remove comment --- charm4py/charm.py | 1 - 1 file changed, 1 deletion(-) diff --git a/charm4py/charm.py b/charm4py/charm.py index f1e87523..334ffd6a 100644 --- a/charm4py/charm.py +++ b/charm4py/charm.py @@ -334,7 +334,6 @@ def recvArrayBcast(self, aid, indexes, ep, msg, dcopy_start): self.invokeEntryMethod(array[index], ep, header, args) def unpackMsg(self, msg, dcopy_start, dest_obj): - # Issue Rgets for GPU data in unpackMsg? But how does recv work? 
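        # (How recv works with GPU buffers in this series: the receiver posts
        # destination device buffers to Channel.recv, e.g.
        #
        #   partner_channel.recv(d_data_recv)   # Numba device array, or
        #   partner_channel.recv(post_buf_addresses=d_data_recv_addr,
        #                        post_buf_sizes=d_data_recv_size)
        #
        # and recv then calls charm.getGPUDirectData / getGPUDirectDataFromAddresses,
        # which issue the Rgets and return a future that recv waits on before
        # returning the message.)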
if msg[:7] == b'_local:': header, args = dest_obj.__removeLocal__(int(msg[7:])) else: From 9582b2a85590aa82b1f1f7154ebf6f0c08fac4d0 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Mon, 25 Jan 2021 19:44:39 -0500 Subject: [PATCH 045/107] fix benchmark for non gpu-direct --- tests/benchmark/bandwidth.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/benchmark/bandwidth.py b/tests/benchmark/bandwidth.py index 318fe849..d2c082dc 100644 --- a/tests/benchmark/bandwidth.py +++ b/tests/benchmark/bandwidth.py @@ -70,12 +70,12 @@ def do_iteration(self, message_size, windows, num_iters, done_future): charm.lib.CudaStreamSynchronize(stream_address) for _ in range(windows): partner_channel.send(h_local_data) - partner_channel.recv() else: for _ in range(windows): partner_channel.send(gpu_src_ptrs = d_local_data_addr, gpu_src_sizes = d_local_data_size ) + partner_channel.recv() else: if not self.gpu_direct: From 6f955077dfd67fd735f7e5f430e5428fab023fc0 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Mon, 25 Jan 2021 20:01:55 -0500 Subject: [PATCH 046/107] separate channels for data/ack --- tests/benchmark/bandwidth.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/benchmark/bandwidth.py b/tests/benchmark/bandwidth.py index d2c082dc..6f013055 100644 --- a/tests/benchmark/bandwidth.py +++ b/tests/benchmark/bandwidth.py @@ -57,6 +57,7 @@ def do_iteration(self, message_size, windows, num_iters, done_future): partner_idx = int(not self.thisIndex[0]) partner = self.thisProxy[partner_idx] partner_channel = Channel(self, partner) + partner_ack_channel = Channel(self, partner) tstart = 0 @@ -68,6 +69,7 @@ def do_iteration(self, message_size, windows, num_iters, done_future): for _ in range(windows): charm.lib.CudaDtoH(h_local_data_addr, d_local_data_addr, message_size, stream_address) charm.lib.CudaStreamSynchronize(stream_address) + # d_local_data.copy_to_host(h_local_data) for _ in range(windows): partner_channel.send(h_local_data) else: @@ -76,7 +78,7 @@ def do_iteration(self, message_size, windows, num_iters, done_future): gpu_src_sizes = d_local_data_size ) - partner_channel.recv() + partner_ack_channel.recv() else: if not self.gpu_direct: for _ in range(windows): @@ -85,11 +87,12 @@ def do_iteration(self, message_size, windows, num_iters, done_future): message_size, stream_address ) charm.lib.CudaStreamSynchronize(stream_address) + # d_local_data.copy_to_device(received) else: for _ in range(windows): partner_channel.recv(post_buf_addresses = d_local_data_addr, post_buf_sizes = d_local_data_size) - partner_channel.send(1) + partner_ack_channel.send(1) tend = time.time() elapsed_time = tend - tstart From c62c621c6e01ab2b85bc5fbaf8defd42eced7bce Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Mon, 25 Jan 2021 21:02:21 -0500 Subject: [PATCH 047/107] fix indentation of partner_ack --- tests/benchmark/bandwidth.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/benchmark/bandwidth.py b/tests/benchmark/bandwidth.py index 6f013055..69da8180 100644 --- a/tests/benchmark/bandwidth.py +++ b/tests/benchmark/bandwidth.py @@ -78,7 +78,7 @@ def do_iteration(self, message_size, windows, num_iters, done_future): gpu_src_sizes = d_local_data_size ) - partner_ack_channel.recv() + partner_ack_channel.recv() else: if not self.gpu_direct: for _ in range(windows): From 99d4398ee35a49a68051ff60a9fb84525ca084be Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Tue, 26 Jan 2021 12:21:01 -0500 Subject: [PATCH 048/107] initialize jacobi object --- 
examples/cuda/gpudirect/jacobi3d/jacobi3d.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 examples/cuda/gpudirect/jacobi3d/jacobi3d.py diff --git a/examples/cuda/gpudirect/jacobi3d/jacobi3d.py b/examples/cuda/gpudirect/jacobi3d/jacobi3d.py new file mode 100644 index 00000000..e69de29b From 121076d91c54567203cd8ca25a098c150548154b Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Tue, 26 Jan 2021 12:45:04 -0500 Subject: [PATCH 049/107] add CLI arguments --- examples/cuda/gpudirect/jacobi3d/jacobi3d.py | 52 ++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/examples/cuda/gpudirect/jacobi3d/jacobi3d.py b/examples/cuda/gpudirect/jacobi3d/jacobi3d.py index e69de29b..5de2cc83 100644 --- a/examples/cuda/gpudirect/jacobi3d/jacobi3d.py +++ b/examples/cuda/gpudirect/jacobi3d/jacobi3d.py @@ -0,0 +1,52 @@ +from charm4py import * +from numba import cuda +from argparse import ArgumentParser +from enum import Enum + +class Defaults(Enum): + GRID_WIDTH = 512, + GRID_HEIGHT = 512, + GRID_DEPTH = 512, + NUM_ITERS = 512, + WARMUP_ITERS = 10, + USE_ZEROCOPY = False + PRINT_ELEMENTS = False + + +def main(args): + Defaults.NUM_CHARES = charm.numPes() + argp = ArgumentParser(description = "Jacobi3D implementation in Charm4Py using " + "CUDA and GPU-Direct communication" + ) + argp.add_argument('-x', '--grid_width', help = "Grid width", + default = Defaults.GRID_WIDTH.value + ) + argp.add_argument('-y', '--grid_height', help = "Grid height", + default = Defaults.GRID_HEIGHT.value + ) + argp.add_argument('-z', '--grid_depth', help = "Grid depth", + default = Defaults.GRID_DEPTH.value + ) + argp.add_argument('-c', '--num_chares', help = "Number of chares", + default = Defaults.NUM_CHARES + ) + argp.add_argument('-i', '--iterations', help = "Number of iterations", + default = Defaults.NUM_ITERS.value + ) + argp.add_argument('-w', '--warmup_iterations', help = "Number of warmup iterations", + default = Defaults.WARMUP_ITERS.value + ) + argp.add_argument('-d', '--use_zerocopy', action = "store_true", + help = "Use zerocopy when performing data transfers", + default = Defaults.USE_ZEROCOPY.value + ) + argp.add_argument('-p', '--print_blocks', help = "Print blocks", + action = "store_true", + default = Defaults.PRINT_ELEMENTS.value + ) + args = argp.parse_args() + # charm.exit() + +# charm.start(main) +if __name__ == '__main__': + main(None) From b202babd1f5890d01ba439ef4230801bfb4f7945 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Tue, 26 Jan 2021 12:52:41 -0500 Subject: [PATCH 050/107] create file containing cuda kernels --- examples/cuda/gpudirect/jacobi3d/kernels.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 examples/cuda/gpudirect/jacobi3d/kernels.py diff --git a/examples/cuda/gpudirect/jacobi3d/kernels.py b/examples/cuda/gpudirect/jacobi3d/kernels.py new file mode 100644 index 00000000..e69de29b From d22cdfb4c5abff49524a1ef6683522d3fbb1e942 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Tue, 26 Jan 2021 13:12:40 -0500 Subject: [PATCH 051/107] initialized a few kernels --- examples/cuda/gpudirect/jacobi3d/kernels.py | 49 +++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/examples/cuda/gpudirect/jacobi3d/kernels.py b/examples/cuda/gpudirect/jacobi3d/kernels.py index e69de29b..cbf43515 100644 --- a/examples/cuda/gpudirect/jacobi3d/kernels.py +++ b/examples/cuda/gpudirect/jacobi3d/kernels.py @@ -0,0 +1,49 @@ +from numba import cuda +from numba.cuda import blockDim, blockIdx, threadIdx + +@cuda.jit(device=True) +def IDX(i,j,k, 
block_width, block_height): + return ((block_width+2)*(block_height+2)*(k)+(block_width+2)*(j)+(i)) + +@cuda.jit +def initKernel(temperature, block_width, block_height, block_depth): + i = blockDim.x * blockIdx.x + threadIdx.x + j = blockDim.y * blockIdx.y + threadIdx.y + k = blockDim.z * blockIdx.z + threadIdx.z + + if i < block_width + 2 and j < block_height + 2 and k < block_depth + 2: + temperature[IDX(i, j, k, block_width, block_height)] = 0 + +@cuda.jit +def ghostInitKernel(ghost, ghost_count): + i = blockDim.x * blockIdx.x + threadIdx.x + if i < ghost_count: + ghost[i] = 0 + +@cuda.jit +def leftBoundaryKernel(temperature, block_width, block_height, block_depth): + j = blockDim.x * blockIdx.x + threadIdx.x + k = blockDim.y * blockIdx.y + threadIdx.y + if j < block_height and k < block_depth: + temperature[IDX(0,1+j,1+k, block_width, block_height)] = 1; + +@cuda.jit +def rightBoundaryKernel(temperature, block_width, block_height, block_depth): + pass + +@cuda.jit +def topBoundaryKernel(temperature, block_width, block_height, block_depth): + pass + +@cuda.jit +def bottomBoundaryKernel(temperature, block_width, block_height, block_depth): + pass + +@cuda.jit +def frontBoundaryKernel(temperature, block_width, block_height, block_depth): + pass + +@cuda.jit +def backBoundaryKernel(temperature, block_width, block_height, block_depth): + pass + From bc821748d3c3eaf6e0f73dd72a939cbec02179d3 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Tue, 26 Jan 2021 13:12:49 -0500 Subject: [PATCH 052/107] include time as well --- examples/cuda/gpudirect/jacobi3d/jacobi3d.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/cuda/gpudirect/jacobi3d/jacobi3d.py b/examples/cuda/gpudirect/jacobi3d/jacobi3d.py index 5de2cc83..77d62c88 100644 --- a/examples/cuda/gpudirect/jacobi3d/jacobi3d.py +++ b/examples/cuda/gpudirect/jacobi3d/jacobi3d.py @@ -2,6 +2,7 @@ from numba import cuda from argparse import ArgumentParser from enum import Enum +import time class Defaults(Enum): GRID_WIDTH = 512, @@ -45,6 +46,7 @@ def main(args): default = Defaults.PRINT_ELEMENTS.value ) args = argp.parse_args() + # charm.exit() # charm.start(main) From a4ba3cf0a73664628a4f7406b68f460678c335e0 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 27 Jan 2021 09:11:48 -0500 Subject: [PATCH 053/107] include boundary, pack kernels --- examples/cuda/gpudirect/jacobi3d/kernels.py | 248 +++++++++++++++++++- 1 file changed, 243 insertions(+), 5 deletions(-) diff --git a/examples/cuda/gpudirect/jacobi3d/kernels.py b/examples/cuda/gpudirect/jacobi3d/kernels.py index cbf43515..b1aec20b 100644 --- a/examples/cuda/gpudirect/jacobi3d/kernels.py +++ b/examples/cuda/gpudirect/jacobi3d/kernels.py @@ -1,6 +1,17 @@ from numba import cuda from numba.cuda import blockDim, blockIdx, threadIdx +TILE_SIZE_3D = 8 +TILE_SIZE_2D = 16 + +LEFT = 0 +RIGHT = 1 +TOP = 2 +BOTTOM = 3 +FRONT = 4 +BACK = 5 +DIR_COUNT = 6 + @cuda.jit(device=True) def IDX(i,j,k, block_width, block_height): return ((block_width+2)*(block_height+2)*(k)+(block_width+2)*(j)+(i)) @@ -29,21 +40,248 @@ def leftBoundaryKernel(temperature, block_width, block_height, block_depth): @cuda.jit def rightBoundaryKernel(temperature, block_width, block_height, block_depth): - pass + j = blockDim.x*blockIdx.x+threadIdx.x + k = blockDim.y*blockIdx.y+threadIdx.y + if j < block_height and k < block_depth: + temperature[IDX(block_width+1,1+j,1+k, block_width, block_height)] = 1; + @cuda.jit def topBoundaryKernel(temperature, block_width, block_height, block_depth): - pass + i = 
blockDim.x*blockIdx.x+threadIdx.x + k = blockDim.y*blockIdx.y+threadIdx.y + if i < block_width and k < block_depth: + temperature[IDX(1+i,0,1+k, block_width, block_height)] = 1 + @cuda.jit def bottomBoundaryKernel(temperature, block_width, block_height, block_depth): - pass + i = blockDim.x*blockIdx.x+threadIdx.x + k = blockDim.y*blockIdx.y+threadIdx.y + if i < block_width and k < block_depth: + temperature[IDX(1+i,block_height+1,1+k, block_width, block_height)] = 1 @cuda.jit def frontBoundaryKernel(temperature, block_width, block_height, block_depth): - pass + i = blockDim.x*blockIdx.x+threadIdx.x + j = blockDim.y*blockIdx.y+threadIdx.y + if i < block_width and j < block_height: + temperature[IDX(1+i,1+j,0, block_width, block_height)] = 1; + @cuda.jit def backBoundaryKernel(temperature, block_width, block_height, block_depth): - pass + i = blockDim.x*blockIdx.x+threadIdx.x + j = blockDim.y*blockIdx.y+threadIdx.y + if i < block_width and j < block_height: + temperature[IDX(1+i,1+j,block_depth+1, block_width, block_height)] = 1 + +@cuda.jit +def jacobiKernel(temp, new_temp, block_width, block_height, block_depth): + i = (blockDim.x*blockIdx.x+threadIdx.x)+1 + j = (blockDim.y*blockIdx.y+threadIdx.y)+1 + k = (blockDim.z*blockIdx.z+threadIdx.z)+1 + + if (i <= block_width && j <= block_height && k <= block_depth): + new_temperature[IDX(i,j,k, block_width, block_height)] = + (temperature[IDX(i,j,k, block_width, block_height)] + + temperature[IDX(i-1,j,k, block_width, block_height)] + + temperature[IDX(i+1,j,k, block_width, block_height)] + + temperature[IDX(i,j-1,k, block_width, block_height)] + + temperature[IDX(i,j+1,k, block_width, block_height)] + + temperature[IDX(i,j,k-1, block_width, block_height)] + + temperature[IDX(i,j,k+1, block_width, block_height)]) * + 0.142857 # equivalent to dividing by 7 + +@cuda.jit +def leftPackingKernel(temperature, ghost, block_width, block_height, block_depth): + j = blockDim.x*blockIdx.x+threadIdx.x; + k = blockDim.y*blockIdx.y+threadIdx.y; + if j < block_height and k < block_depth: + ghost[block_height*k+j] = + temperature[IDX(1,1+j,1+k, block_width, block_height)] + +@cuda.jit +def rightPackingKernel(temperature, ghost, block_width, block_height, block_depth): + j = blockDim.x*blockIdx.x+threadIdx.x + k = blockDim.y*blockIdx.y+threadIdx.y + if j < block_height and k < block_depth: + ghost[block_height*k+j] = + temperature[IDX(1,1+j,1+k, block_width, block_height)] + } + + +@cuda.jit +def topPackingKernel(temperature, ghost, block_width, block_height, block_depth): + i = blockDim.x*blockIdx.x+threadIdx.x + k = blockDim.y*blockIdx.y+threadIdx.y + if i < block_width and k < block_depth: + ghost[block_width*k+i] = + temperature[IDX(1+i,1,1+k, block_width, block_height)] + +@cuda.jit +def bottomPackingKernel(temperature, ghost, block_width, block_height, block_depth): + i = blockDim.x*blockIdx.x+threadIdx.x + k = blockDim.y*blockIdx.y+threadIdx.y + if i < block_width and k < block_depth: + ghost[block_width*k+i] = + temperature[IDX(1+i,block_height,1+k, block_width, block_height)]; + } + +@cuda.jit +def frontPackingKernel(temperature, ghost, block_width, block_height, block_depth): + i = blockDim.x*blockIdx.x+threadIdx.x + j = blockDim.y*blockIdx.y+threadIdx.y + if i < block_width and j < block_height: + temperature[IDX(1+i,1+j,0, block_width, block_height)] = + ghost[block_width*j+i] + +@cuda.jit +def backPackingKernel(temperature, ghost, block_width, block_height, block_depth): + i = blockDim.x*blockIdx.x+threadIdx.x + j = blockDim.y*blockIdx.y+threadIdx.y 
+ if i < block_width and j < block_height: + temperature[IDX(1+i,1+j,block_depth+1, block_width, block_height)] = + ghost[block_width*j+i] + + +@cuda.jit +def leftUnpackingKernel(temperature, ghost, block_width, block_height, block_depth): + j = blockDim.x*blockIdx.x+threadIdx.x + k = blockDim.y*blockIdx.y+threadIdx.y + if j < block_height and k < block_depth: + temperature[IDX(0,1+j,1+k, block_width, block_height)] = ghost[block_height*k+j] + + + +@cuda.jit +def rightUnpackingKernel(temperature, ghost, block_width, block_height, block_depth): + j = blockDim.x*blockIdx.x+threadIdx.x + k = blockDim.y*blockIdx.y+threadIdx.y + if j < block_height and k < block_depth: + temperature[IDX(block_width+1,1+j,1+k, block_width, block_height)] = ghost[block_height*k+j] + +@cuda.jit +def topUnpackingKernel(temperature, ghost, block_width, block_height, block_depth): + i = blockDim.x*blockIdx.x+threadIdx.x + k = blockDim.y*blockIdx.y+threadIdx.y + if i < block_width and k < block_depth: + temperature[IDX(1+i,0,1+k, block_width, block_height)] = ghost[block_width*k+i] + +@cuda.jit +def bottomUnpackingKernel(temperature, ghost, block_width, block_height, block_depth): + i = blockDim.x*blockIdx.x+threadIdx.x + k = blockDim.y*blockIdx.y+threadIdx.y + if i < block_width and k < block_depth: + temperature[IDX(1+i,block_height+1,1+k, block_width, block_height)] = ghost[block_width*k+i] + +@cuda.jit +def frontUnpackingKernel(temperature, ghost, block_width, block_height, block_depth): + i = blockDim.x*blockIdx.x+threadIdx.x + j = blockDim.y*blockIdx.y+threadIdx.y + if i < block_width and j < block_height: + temperature[IDX(1+i,1+j,0, block_width, block_height)] = ghost[block_width*j+i] + +@cuda.jit +def backUnpackingKernel(temperature, ghost, block_width, block_height, block_depth): + i = blockDim.x*blockIdx.x+threadIdx.x + j = blockDim.y*blockIdx.y+threadIdx.y + if i < block_width and j < block_height: + temperature[IDX(1+i,1+j,block_depth+1, block_width, block_height)] = ghost[block_width*j+i] + +def invokeInitKernel(temp_dev_array, block_width, block_height, block_depth, stream): + block_dim = (TILE_SIZE_3D, TILE_SIZE_3D, TILE_SIZE_3D) + grid_dim = (((block_width+2)+(block_dim[0]-1))/block_dim[0], # x + ((block_height+2)+(block_dim[1]-1))/block_dim[1], # y + ((block_depth+2)+(block_dim[2]-1))/block_dim[2]) # z + + initKernel[grid_dim, block_dim, stream](temp_dev_array, + block_width, block_height, + block_depth) + + +def invokeGhostInitKernels(ghosts, ghost_counts, stream): + #TODO: this fn will probably have to change if the ghosts/counts can't + # be transferred automatically + # https://docs.nvidia.com/cuda/cuda-c-programming-guide/#dim3 + block_dim = (256, 1, 1) + dim3 block_dim(256); + for i in range(len(ghosts)): + ghost = ghosts[i] + ghost_count = ghost_counts[i] + grid_dim = (ghost_count+block_dim[0]-1)//block_dim[0], 1, 1) + + ghostInitKernel[grid_dim, block_dim, stream](ghosts, ghost_count) + +def invokeBoundaryKernels(d_temperature, block_width, block_height, block_depth, bounds, stream): + block_dim = (TILE_SIZE_2D, TILE_SIZE_2D, 1) + + if bounds(LEFT): + grid_dim = ((block_height+(block_dim.x-1))/block_dim.x, + (block_depth+(block_dim.y-1))/block_dim.y, 1) + leftBoundaryKernel[grid_dim, block_dim, stream](d_temperature, + block_width, + block_height, + block_depth + ) + if bounds(RIGHT): + grid_dim = ((block_height+(block_dim.x-1))/block_dim.x, + (block_depth+(block_dim.y-1))/block_dim.y, 1) + rightBoundaryKernel[grid_dim, block_dim, stream](d_temperature, + block_width, + block_height, + 
block_depth + ) + + if bounds(TOP): + grid_dim = ((block_width+(block_dim.x-1))/block_dim.x, + (block_depth+(block_dim.y-1))/block_dim.y, 1) + topBoundaryKernel[grid_dim, block_dim, stream](d_temperature, + block_width, + block_height, + block_depth + ) + + if bounds(BOTTOM): + grid_dim = ((block_width+(block_dim.x-1))/block_dim.x, + (block_depth+(block_dim.y-1))/block_dim.y, 1) + bottomBoundaryKernel[grid_dim, block_dim, stream](d_temperature, + block_width, + block_height, + block_depth + ) + + if bounds(FRONT): + grid_dim = ((block_width+(block_dim.x-1))/block_dim.x, + (block_height+(block_dim.y-1))/block_dim.y, 1) + frontBoundaryKernel[grid_dim, block_dim, stream](d_temperature, + block_width, + block_height, + block_depth + ) + + if bounds(BACK): + grid_dim = ((block_width+(block_dim.x-1))/block_dim.x, + (block_height+(block_dim.y-1))/block_dim.y, 1) + backBoundaryKernel[grid_dim, block_dim, stream](d_temperature, + block_width, + block_height, + block_depth + ) + + +def invokeJacobiKernel(d_temperature, d_new_temperature, block_width, block_height, block_depth, stream): + block_dim = (TILE_SIZE_3D, TILE_SIZE_3D, TILE_SIZE_3D) + grid_dim = ((block_width+(block_dim.x-1))/block_dim.x, + (block_height+(block_dim.y-1))/block_dim.y, + (block_depth+(block_dim.z-1))/block_dim.z) + + jacobiKernel[grid_dim, block_dim, stream](d_temperature, + d_new_temperature, + block_width, + block_height, + block_depth + ) + + From 7652d80c582355faa37006d5820b26698bb6a4f9 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 27 Jan 2021 09:37:34 -0500 Subject: [PATCH 054/107] invoke pack/unpack kernels --- examples/cuda/gpudirect/jacobi3d/kernels.py | 114 ++++++++++++++++++++ 1 file changed, 114 insertions(+) diff --git a/examples/cuda/gpudirect/jacobi3d/kernels.py b/examples/cuda/gpudirect/jacobi3d/kernels.py index b1aec20b..213974bd 100644 --- a/examples/cuda/gpudirect/jacobi3d/kernels.py +++ b/examples/cuda/gpudirect/jacobi3d/kernels.py @@ -284,4 +284,118 @@ def invokeJacobiKernel(d_temperature, d_new_temperature, block_width, block_heig ) +def inbokePackingKernel(d_temperature, d_ghost, dir, block_width, block_height, block_depth, stream): + block_dim = (TILE_SIZE_2D, TILE_SIZE_2D, 1) + + if dir == LEFT: + grid_dim = ((block_height+(block_dim.x-1))/block_dim.x, + (block_depth+(block_dim.y-1))/block_dim.y, 1) + leftPackingKernel[grid_dim, block_dim, stream](d_temperature, + d_ghost, + block_width, + block_height, + block_depth + ) + elif dir == RIGHT: + grid_dim = ((block_height+(block_dim.x-1))/block_dim.x, + (block_depth+(block_dim.y-1))/block_dim.y, 1) + rightPackingKernel[grid_dim, block_dim, stream](d_temperature, + d_ghost, + block_width, + block_height, + block_depth + ) + elif dir == TOP: + grid_dim = ((block_width+(block_dim.x-1))/block_dim.x, + (block_depth+(block_dim.y-1))/block_dim.y, 1) + topPackingKernel[grid_dim, block_dim, stream](d_temperature, + d_ghost, + block_width, + block_height, + block_depth + ) + elif dir == BOTTOM: + grid_dim = ((block_width+(block_dim.x-1))/block_dim.x, + (block_depth+(block_dim.y-1))/block_dim.y, 1) + bottomPackingKernel[grid_dim, block_dim, stream](d_temperature, + d_ghost, + block_width, + block_height, + block_depth + ) + elif dir == FRONT: + grid_dim = ((block_width+(block_dim.x-1))/block_dim.x, + (block_height+(block_dim.y-1))/block_dim.y, 1) + frontPackingKernel[grid_dim, block_dim, stream](d_temperature, + d_ghost, + block_width, + block_height, + block_depth + ) + elif dir == BACK: + grid_dim = ((block_width+(block_dim.x-1))/block_dim.x, + 
(block_height+(block_dim.y-1))/block_dim.y, 1) + backPackingKernel[grid_dim, block_dim, stream](d_temperature, + d_ghost, + block_width, + block_height, + block_depth + ) + +def invokeUnpackingKernel(d_temperature, d_ghost, dir, block_width, block_height, block_depth, stream): + block_dim = (TILE_SIZE_2D, TILE_SIZE_2D, 1) + if dir == LEFT: + grid_dim = ((block_height+(block_dim.x-1))/block_dim.x, + (block_depth+(block_dim.y-1))/block_dim.y, 1) + leftUnpackingKernel[grid_dim, block_dim, stream](d_temperature, + d_ghost, + block_width, + block_height, + block_depth + ) + if dir == RIGHT: + grid_dim = ((block_height+(block_dim.x-1))/block_dim.x, + (block_depth+(block_dim.y-1))/block_dim.y, 1) + rightUnpackingKernel[grid_dim, block_dim, stream](d_temperature, + d_ghost, + block_width, + block_height, + block_depth + ) + if dir == TOP: + grid_dim = ((block_width+(block_dim.x-1))/block_dim.x, + (block_depth+(block_dim.y-1))/block_dim.y, 1) + topUnpackingKernel[grid_dim, block_dim, stream](d_temperature, + d_ghost, + block_width, + block_height, + block_depth + ) + if dir == BOTTOM: + grid_dim = ((block_width+(block_dim.x-1))/block_dim.x, + (block_depth+(block_dim.y-1))/block_dim.y, 1) + bottomUnpackingKernel[grid_dim, block_dim, stream](d_temperature, + d_ghost, + block_width, + block_height, + block_depth + ) + if dir == FRONT: + grid_dim = ((block_width+(block_dim.x-1))/block_dim.x, + (block_height+(block_dim.y-1))/block_dim.y, 1) + frontUnpackingKernel[grid_dim, block_dim, stream](d_temperature, + d_ghost, + block_width, + block_height, + block_depth + ) + if dir == BACK: + grid_dim = ((block_width+(block_dim.x-1))/block_dim.x, + (block_height+(block_dim.y-1))/block_dim.y, 1) + backUnpackingKernel[grid_dim, block_dim, stream](d_temperature, + d_ghost, + block_width, + block_height, + block_depth + ) From 8a7253ea66fc2e3d7b87f1f2e51d1a3a8f95dfcf Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 27 Jan 2021 10:26:11 -0500 Subject: [PATCH 055/107] use integer division --- examples/cuda/gpudirect/jacobi3d/kernels.py | 84 ++++++++++----------- 1 file changed, 42 insertions(+), 42 deletions(-) diff --git a/examples/cuda/gpudirect/jacobi3d/kernels.py b/examples/cuda/gpudirect/jacobi3d/kernels.py index 213974bd..bf4d159c 100644 --- a/examples/cuda/gpudirect/jacobi3d/kernels.py +++ b/examples/cuda/gpudirect/jacobi3d/kernels.py @@ -191,9 +191,9 @@ def backUnpackingKernel(temperature, ghost, block_width, block_height, block_dep def invokeInitKernel(temp_dev_array, block_width, block_height, block_depth, stream): block_dim = (TILE_SIZE_3D, TILE_SIZE_3D, TILE_SIZE_3D) - grid_dim = (((block_width+2)+(block_dim[0]-1))/block_dim[0], # x - ((block_height+2)+(block_dim[1]-1))/block_dim[1], # y - ((block_depth+2)+(block_dim[2]-1))/block_dim[2]) # z + grid_dim = (((block_width+2)+(block_dim[0]-1))//block_dim[0], # x + ((block_height+2)+(block_dim[1]-1))//block_dim[1], # y + ((block_depth+2)+(block_dim[2]-1))//block_dim[2]) # z initKernel[grid_dim, block_dim, stream](temp_dev_array, block_width, block_height, @@ -217,16 +217,16 @@ def invokeBoundaryKernels(d_temperature, block_width, block_height, block_depth, block_dim = (TILE_SIZE_2D, TILE_SIZE_2D, 1) if bounds(LEFT): - grid_dim = ((block_height+(block_dim.x-1))/block_dim.x, - (block_depth+(block_dim.y-1))/block_dim.y, 1) + grid_dim = ((block_height+(block_dim.x-1))//block_dim.x, + (block_depth+(block_dim.y-1))//block_dim.y, 1) leftBoundaryKernel[grid_dim, block_dim, stream](d_temperature, block_width, block_height, block_depth ) if bounds(RIGHT): - grid_dim 
= ((block_height+(block_dim.x-1))/block_dim.x, - (block_depth+(block_dim.y-1))/block_dim.y, 1) + grid_dim = ((block_height+(block_dim.x-1))//block_dim.x, + (block_depth+(block_dim.y-1))//block_dim.y, 1) rightBoundaryKernel[grid_dim, block_dim, stream](d_temperature, block_width, block_height, @@ -234,8 +234,8 @@ def invokeBoundaryKernels(d_temperature, block_width, block_height, block_depth, ) if bounds(TOP): - grid_dim = ((block_width+(block_dim.x-1))/block_dim.x, - (block_depth+(block_dim.y-1))/block_dim.y, 1) + grid_dim = ((block_width+(block_dim.x-1))//block_dim.x, + (block_depth+(block_dim.y-1))//block_dim.y, 1) topBoundaryKernel[grid_dim, block_dim, stream](d_temperature, block_width, block_height, @@ -243,8 +243,8 @@ def invokeBoundaryKernels(d_temperature, block_width, block_height, block_depth, ) if bounds(BOTTOM): - grid_dim = ((block_width+(block_dim.x-1))/block_dim.x, - (block_depth+(block_dim.y-1))/block_dim.y, 1) + grid_dim = ((block_width+(block_dim.x-1))//block_dim.x, + (block_depth+(block_dim.y-1))//block_dim.y, 1) bottomBoundaryKernel[grid_dim, block_dim, stream](d_temperature, block_width, block_height, @@ -252,8 +252,8 @@ def invokeBoundaryKernels(d_temperature, block_width, block_height, block_depth, ) if bounds(FRONT): - grid_dim = ((block_width+(block_dim.x-1))/block_dim.x, - (block_height+(block_dim.y-1))/block_dim.y, 1) + grid_dim = ((block_width+(block_dim.x-1))//block_dim.x, + (block_height+(block_dim.y-1))//block_dim.y, 1) frontBoundaryKernel[grid_dim, block_dim, stream](d_temperature, block_width, block_height, @@ -261,8 +261,8 @@ def invokeBoundaryKernels(d_temperature, block_width, block_height, block_depth, ) if bounds(BACK): - grid_dim = ((block_width+(block_dim.x-1))/block_dim.x, - (block_height+(block_dim.y-1))/block_dim.y, 1) + grid_dim = ((block_width+(block_dim.x-1))//block_dim.x, + (block_height+(block_dim.y-1))//block_dim.y, 1) backBoundaryKernel[grid_dim, block_dim, stream](d_temperature, block_width, block_height, @@ -272,9 +272,9 @@ def invokeBoundaryKernels(d_temperature, block_width, block_height, block_depth, def invokeJacobiKernel(d_temperature, d_new_temperature, block_width, block_height, block_depth, stream): block_dim = (TILE_SIZE_3D, TILE_SIZE_3D, TILE_SIZE_3D) - grid_dim = ((block_width+(block_dim.x-1))/block_dim.x, - (block_height+(block_dim.y-1))/block_dim.y, - (block_depth+(block_dim.z-1))/block_dim.z) + grid_dim = ((block_width+(block_dim.x-1))//block_dim.x, + (block_height+(block_dim.y-1))//block_dim.y, + (block_depth+(block_dim.z-1))//block_dim.z) jacobiKernel[grid_dim, block_dim, stream](d_temperature, d_new_temperature, @@ -288,8 +288,8 @@ def inbokePackingKernel(d_temperature, d_ghost, dir, block_width, block_height, block_dim = (TILE_SIZE_2D, TILE_SIZE_2D, 1) if dir == LEFT: - grid_dim = ((block_height+(block_dim.x-1))/block_dim.x, - (block_depth+(block_dim.y-1))/block_dim.y, 1) + grid_dim = ((block_height+(block_dim.x-1))//block_dim.x, + (block_depth+(block_dim.y-1))//block_dim.y, 1) leftPackingKernel[grid_dim, block_dim, stream](d_temperature, d_ghost, block_width, @@ -297,8 +297,8 @@ def inbokePackingKernel(d_temperature, d_ghost, dir, block_width, block_height, block_depth ) elif dir == RIGHT: - grid_dim = ((block_height+(block_dim.x-1))/block_dim.x, - (block_depth+(block_dim.y-1))/block_dim.y, 1) + grid_dim = ((block_height+(block_dim.x-1))//block_dim.x, + (block_depth+(block_dim.y-1))//block_dim.y, 1) rightPackingKernel[grid_dim, block_dim, stream](d_temperature, d_ghost, block_width, @@ -306,8 +306,8 @@ def 
inbokePackingKernel(d_temperature, d_ghost, dir, block_width, block_height, block_depth ) elif dir == TOP: - grid_dim = ((block_width+(block_dim.x-1))/block_dim.x, - (block_depth+(block_dim.y-1))/block_dim.y, 1) + grid_dim = ((block_width+(block_dim.x-1))//block_dim.x, + (block_depth+(block_dim.y-1))//block_dim.y, 1) topPackingKernel[grid_dim, block_dim, stream](d_temperature, d_ghost, block_width, @@ -315,8 +315,8 @@ def inbokePackingKernel(d_temperature, d_ghost, dir, block_width, block_height, block_depth ) elif dir == BOTTOM: - grid_dim = ((block_width+(block_dim.x-1))/block_dim.x, - (block_depth+(block_dim.y-1))/block_dim.y, 1) + grid_dim = ((block_width+(block_dim.x-1))//block_dim.x, + (block_depth+(block_dim.y-1))//block_dim.y, 1) bottomPackingKernel[grid_dim, block_dim, stream](d_temperature, d_ghost, block_width, @@ -324,8 +324,8 @@ def inbokePackingKernel(d_temperature, d_ghost, dir, block_width, block_height, block_depth ) elif dir == FRONT: - grid_dim = ((block_width+(block_dim.x-1))/block_dim.x, - (block_height+(block_dim.y-1))/block_dim.y, 1) + grid_dim = ((block_width+(block_dim.x-1))//block_dim.x, + (block_height+(block_dim.y-1))//block_dim.y, 1) frontPackingKernel[grid_dim, block_dim, stream](d_temperature, d_ghost, block_width, @@ -333,8 +333,8 @@ def inbokePackingKernel(d_temperature, d_ghost, dir, block_width, block_height, block_depth ) elif dir == BACK: - grid_dim = ((block_width+(block_dim.x-1))/block_dim.x, - (block_height+(block_dim.y-1))/block_dim.y, 1) + grid_dim = ((block_width+(block_dim.x-1))//block_dim.x, + (block_height+(block_dim.y-1))//block_dim.y, 1) backPackingKernel[grid_dim, block_dim, stream](d_temperature, d_ghost, block_width, @@ -346,8 +346,8 @@ def invokeUnpackingKernel(d_temperature, d_ghost, dir, block_width, block_height block_dim = (TILE_SIZE_2D, TILE_SIZE_2D, 1) if dir == LEFT: - grid_dim = ((block_height+(block_dim.x-1))/block_dim.x, - (block_depth+(block_dim.y-1))/block_dim.y, 1) + grid_dim = ((block_height+(block_dim.x-1))//block_dim.x, + (block_depth+(block_dim.y-1))//block_dim.y, 1) leftUnpackingKernel[grid_dim, block_dim, stream](d_temperature, d_ghost, block_width, @@ -355,8 +355,8 @@ def invokeUnpackingKernel(d_temperature, d_ghost, dir, block_width, block_height block_depth ) if dir == RIGHT: - grid_dim = ((block_height+(block_dim.x-1))/block_dim.x, - (block_depth+(block_dim.y-1))/block_dim.y, 1) + grid_dim = ((block_height+(block_dim.x-1))//block_dim.x, + (block_depth+(block_dim.y-1))//block_dim.y, 1) rightUnpackingKernel[grid_dim, block_dim, stream](d_temperature, d_ghost, block_width, @@ -364,8 +364,8 @@ def invokeUnpackingKernel(d_temperature, d_ghost, dir, block_width, block_height block_depth ) if dir == TOP: - grid_dim = ((block_width+(block_dim.x-1))/block_dim.x, - (block_depth+(block_dim.y-1))/block_dim.y, 1) + grid_dim = ((block_width+(block_dim.x-1))//block_dim.x, + (block_depth+(block_dim.y-1))//block_dim.y, 1) topUnpackingKernel[grid_dim, block_dim, stream](d_temperature, d_ghost, block_width, @@ -373,8 +373,8 @@ def invokeUnpackingKernel(d_temperature, d_ghost, dir, block_width, block_height block_depth ) if dir == BOTTOM: - grid_dim = ((block_width+(block_dim.x-1))/block_dim.x, - (block_depth+(block_dim.y-1))/block_dim.y, 1) + grid_dim = ((block_width+(block_dim.x-1))//block_dim.x, + (block_depth+(block_dim.y-1))//block_dim.y, 1) bottomUnpackingKernel[grid_dim, block_dim, stream](d_temperature, d_ghost, block_width, @@ -382,8 +382,8 @@ def invokeUnpackingKernel(d_temperature, d_ghost, dir, block_width, block_height 
block_depth ) if dir == FRONT: - grid_dim = ((block_width+(block_dim.x-1))/block_dim.x, - (block_height+(block_dim.y-1))/block_dim.y, 1) + grid_dim = ((block_width+(block_dim.x-1))//block_dim.x, + (block_height+(block_dim.y-1))//block_dim.y, 1) frontUnpackingKernel[grid_dim, block_dim, stream](d_temperature, d_ghost, block_width, @@ -391,8 +391,8 @@ def invokeUnpackingKernel(d_temperature, d_ghost, dir, block_width, block_height block_depth ) if dir == BACK: - grid_dim = ((block_width+(block_dim.x-1))/block_dim.x, - (block_height+(block_dim.y-1))/block_dim.y, 1) + grid_dim = ((block_width+(block_dim.x-1))//block_dim.x, + (block_height+(block_dim.y-1))//block_dim.y, 1) backUnpackingKernel[grid_dim, block_dim, stream](d_temperature, d_ghost, block_width, From 0d614be6030eeb75354f080bbb429f694b58648d Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 27 Jan 2021 10:53:48 -0500 Subject: [PATCH 056/107] add program initialization --- examples/cuda/gpudirect/jacobi3d/jacobi3d.py | 72 +++++++++++++++++++- 1 file changed, 70 insertions(+), 2 deletions(-) diff --git a/examples/cuda/gpudirect/jacobi3d/jacobi3d.py b/examples/cuda/gpudirect/jacobi3d/jacobi3d.py index 77d62c88..1c9ec39a 100644 --- a/examples/cuda/gpudirect/jacobi3d/jacobi3d.py +++ b/examples/cuda/gpudirect/jacobi3d/jacobi3d.py @@ -1,7 +1,9 @@ from charm4py import * from numba import cuda +import numpy as np from argparse import ArgumentParser from enum import Enum +from functools import reduce import time class Defaults(Enum): @@ -16,7 +18,7 @@ class Defaults(Enum): def main(args): Defaults.NUM_CHARES = charm.numPes() - argp = ArgumentParser(description = "Jacobi3D implementation in Charm4Py using " + argp = ArgumentParser(description ="Jacobi3D implementation in Charm4Py using " "CUDA and GPU-Direct communication" ) argp.add_argument('-x', '--grid_width', help = "Grid width", @@ -47,7 +49,73 @@ def main(args): ) args = argp.parse_args() - # charm.exit() + num_chares_per_dim = calc_num_chares_per_dim(num_chares, + grid_width, + grid_height, + grid_depth + ) + n_chares_x, n_chares_y, n_chares_z = num_chares_per_dim + + if reduce(lambda x, y: x*y, n_chares_per_dim) != num_chares: + print(f"ERROR: Bad grid of chares: {n_chares_x} x {n_chares_y} x " + f"{n_chares_z} != {num_chares}" + ) + charm.exit(-1) + + # Calculate block size + block_width = grid_width // n_chares_x + block_height = grid_height // n_chares_y + block_depth = grid_depth // n_chares_z + + # Calculate surf count, sizes + x_surf_count = block_height * block_depth + y_surf_count = block_width * block_depth + z_surf_count = block_width * block_height + x_surf_size = x_surf_count * np.dtype(np.float64).itemsize + y_surf_size = y_surf_count * np.dtype(np.float64).itemsize + z_surf_size = z_surf_count * np.dtype(np.float64).itemsize + + + # print configuration + print("\n[CUDA 3D Jacobi example]n") + print(f"Grid: {grid_width} x {grid_height} x {grid_depth}, " + f"Block: {block_width} x {block_height} x {block_depth}, " + f"Chares: {n_chares_x} x {n_chares_y} x {n_chares_z}, " + f"Iterations: {n_iters}, Warm-up: {warmup_iters}, " + f"Zerocopy: {use_zerocopy}, Print: {print_elements}\n\n", + ) + + +def calc_num_chares_per_dim(num_chares_total, grid_w, grid_h, grid_d): + n_chares = [0, 0, 0] + area = [0.0, 0.0, 0.0] + area[0] = grid_w * grid_h + area[1] = grid_w * grid_d + area[2] = grid_h * grid_d + + bestsurf = 2.0 * sum(area) + + ipx = 1 + + while ipx <= num_chares: + if not num_chares % ipx: + nremain = num_chares // ipx + ipy = 1 + + while ipy <= nremain: + if not nremain % 
ipy: + ipz = nremain // ipy + surf = area[0] / ipx / ipy + area[1] / ipz + area[2] / ipy / ipz + + if surf < bestsuf: + bestsurf = surf + n_chares[0] = ipx + n_chares[1] = ipy + n_chares[2] = ipz + ipy += 1 + ipx += 1 + + return n_chares # charm.start(main) if __name__ == '__main__': From cf97f37c2515872f56d9c752ecbc6a9871bb748a Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 27 Jan 2021 11:22:51 -0500 Subject: [PATCH 057/107] globals are now broadcast to chares --- examples/cuda/gpudirect/jacobi3d/jacobi3d.py | 33 +++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/examples/cuda/gpudirect/jacobi3d/jacobi3d.py b/examples/cuda/gpudirect/jacobi3d/jacobi3d.py index 1c9ec39a..2dc689fe 100644 --- a/examples/cuda/gpudirect/jacobi3d/jacobi3d.py +++ b/examples/cuda/gpudirect/jacobi3d/jacobi3d.py @@ -85,6 +85,37 @@ def main(args): f"Zerocopy: {use_zerocopy}, Print: {print_elements}\n\n", ) + charm.thisProxy.updateGlobals({'num_chares': num_chares, + 'grid_width': grid_width, + 'grid_height': grid_height, + 'grid_depth': grid_depth, + 'block_width': block_width, + 'block_height': block_height, + 'block_depth': block_depth, + 'x_surf_count': x_surf_count, + 'y_surf_count': y_surf_count, + 'z_surf_count': z_surf_count, + 'x_surf_size': x_surf_size, + 'y_surf_size': y_surf_size, + 'z_surf_size': z_surf_size, + 'n_chares_x': n_chares_x, + 'n_chares_y': n_chares_y, + 'n_chares_z': n_chares_z, + 'n_iters': n_iters, + 'warmup_iters': warmup_iters, + 'use_zerocopy': use_zerocopy, + 'print_elements': print_elements + }, awaitable = True, module_name = 'block' + ).get() + + init_done_future = Future() + block_proxy = Array(Block, + dims=[n_chares_x, n_chares_y, n_chares_z], + args = init_done_future + ) + init_done_future.get() + charm.exit() + def calc_num_chares_per_dim(num_chares_total, grid_w, grid_h, grid_d): n_chares = [0, 0, 0] @@ -117,6 +148,6 @@ def calc_num_chares_per_dim(num_chares_total, grid_w, grid_h, grid_d): return n_chares -# charm.start(main) +# charm.start(main, modules = ['block']) if __name__ == '__main__': main(None) From 9cdd749775fe6522fe6e5454df0908282dac93cb Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 27 Jan 2021 11:42:17 -0500 Subject: [PATCH 058/107] fix Block import, turn args into vars --- examples/cuda/gpudirect/jacobi3d/jacobi3d.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/examples/cuda/gpudirect/jacobi3d/jacobi3d.py b/examples/cuda/gpudirect/jacobi3d/jacobi3d.py index 2dc689fe..b0489b7b 100644 --- a/examples/cuda/gpudirect/jacobi3d/jacobi3d.py +++ b/examples/cuda/gpudirect/jacobi3d/jacobi3d.py @@ -4,6 +4,7 @@ from argparse import ArgumentParser from enum import Enum from functools import reduce +from block import Block import time class Defaults(Enum): @@ -47,7 +48,17 @@ def main(args): action = "store_true", default = Defaults.PRINT_ELEMENTS.value ) - args = argp.parse_args() + args, _ = argp.parse_known_args() + + grid_width = args.grid_width + grid_height = args.grid_height + grid_depth = args.grid_depth + num_chares = args.num_chares + iterations = args.iterations + warmup_iterations = args.warmup_iterations + use_zerocopy = args.use_zerocopy + print_blocks = args.print_blocks + num_chares_per_dim = calc_num_chares_per_dim(num_chares, grid_width, @@ -148,6 +159,6 @@ def calc_num_chares_per_dim(num_chares_total, grid_w, grid_h, grid_d): return n_chares -# charm.start(main, modules = ['block']) -if __name__ == '__main__': - main(None) +charm.start(main, modules = ['block']) +# if __name__ 
== '__main__': + # main(None) From 123d54fb268542a41d63a11e717e12b30fa14a53 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 27 Jan 2021 11:43:50 -0500 Subject: [PATCH 059/107] enum members no longer tuple --- examples/cuda/gpudirect/jacobi3d/jacobi3d.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/examples/cuda/gpudirect/jacobi3d/jacobi3d.py b/examples/cuda/gpudirect/jacobi3d/jacobi3d.py index b0489b7b..d2fca790 100644 --- a/examples/cuda/gpudirect/jacobi3d/jacobi3d.py +++ b/examples/cuda/gpudirect/jacobi3d/jacobi3d.py @@ -8,11 +8,11 @@ import time class Defaults(Enum): - GRID_WIDTH = 512, - GRID_HEIGHT = 512, - GRID_DEPTH = 512, - NUM_ITERS = 512, - WARMUP_ITERS = 10, + GRID_WIDTH = 512 + GRID_HEIGHT = 512 + GRID_DEPTH = 512 + NUM_ITERS = 512 + WARMUP_ITERS = 10 USE_ZEROCOPY = False PRINT_ELEMENTS = False @@ -61,10 +61,10 @@ def main(args): num_chares_per_dim = calc_num_chares_per_dim(num_chares, - grid_width, - grid_height, - grid_depth - ) + grid_width, + grid_height, + grid_depth + ) n_chares_x, n_chares_y, n_chares_z = num_chares_per_dim if reduce(lambda x, y: x*y, n_chares_per_dim) != num_chares: @@ -131,6 +131,7 @@ def main(args): def calc_num_chares_per_dim(num_chares_total, grid_w, grid_h, grid_d): n_chares = [0, 0, 0] area = [0.0, 0.0, 0.0] + print(grid_w, grid_h, grid_d) area[0] = grid_w * grid_h area[1] = grid_w * grid_d area[2] = grid_h * grid_d From 731c96a96b540dfbb987ce453a84fc8a5513adac Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 27 Jan 2021 11:44:10 -0500 Subject: [PATCH 060/107] enum members no longer tuple --- examples/cuda/gpudirect/jacobi3d/jacobi3d.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cuda/gpudirect/jacobi3d/jacobi3d.py b/examples/cuda/gpudirect/jacobi3d/jacobi3d.py index d2fca790..11a0d9c7 100644 --- a/examples/cuda/gpudirect/jacobi3d/jacobi3d.py +++ b/examples/cuda/gpudirect/jacobi3d/jacobi3d.py @@ -128,7 +128,7 @@ def main(args): charm.exit() -def calc_num_chares_per_dim(num_chares_total, grid_w, grid_h, grid_d): +def calc_num_chares_per_dim(num_chares, grid_w, grid_h, grid_d): n_chares = [0, 0, 0] area = [0.0, 0.0, 0.0] print(grid_w, grid_h, grid_d) From cc079019a5219895dbb354006a8c93fa8653ead0 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 27 Jan 2021 11:47:32 -0500 Subject: [PATCH 061/107] fix some names, print statements --- examples/cuda/gpudirect/jacobi3d/jacobi3d.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/examples/cuda/gpudirect/jacobi3d/jacobi3d.py b/examples/cuda/gpudirect/jacobi3d/jacobi3d.py index 11a0d9c7..2efa842e 100644 --- a/examples/cuda/gpudirect/jacobi3d/jacobi3d.py +++ b/examples/cuda/gpudirect/jacobi3d/jacobi3d.py @@ -54,10 +54,10 @@ def main(args): grid_height = args.grid_height grid_depth = args.grid_depth num_chares = args.num_chares - iterations = args.iterations - warmup_iterations = args.warmup_iterations + n_iters = args.iterations + warmup_iters = args.warmup_iterations use_zerocopy = args.use_zerocopy - print_blocks = args.print_blocks + print_elements = args.print_blocks num_chares_per_dim = calc_num_chares_per_dim(num_chares, @@ -67,7 +67,7 @@ def main(args): ) n_chares_x, n_chares_y, n_chares_z = num_chares_per_dim - if reduce(lambda x, y: x*y, n_chares_per_dim) != num_chares: + if reduce(lambda x, y: x*y, num_chares_per_dim) != num_chares: print(f"ERROR: Bad grid of chares: {n_chares_x} x {n_chares_y} x " f"{n_chares_z} != {num_chares}" ) @@ -88,7 +88,7 @@ def main(args): # print 
configuration - print("\n[CUDA 3D Jacobi example]n") + print("\n[CUDA 3D Jacobi example]\n") print(f"Grid: {grid_width} x {grid_height} x {grid_depth}, " f"Block: {block_width} x {block_height} x {block_depth}, " f"Chares: {n_chares_x} x {n_chares_y} x {n_chares_z}, " @@ -122,7 +122,7 @@ def main(args): init_done_future = Future() block_proxy = Array(Block, dims=[n_chares_x, n_chares_y, n_chares_z], - args = init_done_future + args = [init_done_future] ) init_done_future.get() charm.exit() @@ -131,7 +131,6 @@ def main(args): def calc_num_chares_per_dim(num_chares, grid_w, grid_h, grid_d): n_chares = [0, 0, 0] area = [0.0, 0.0, 0.0] - print(grid_w, grid_h, grid_d) area[0] = grid_w * grid_h area[1] = grid_w * grid_d area[2] = grid_h * grid_d @@ -150,7 +149,7 @@ def calc_num_chares_per_dim(num_chares, grid_w, grid_h, grid_d): ipz = nremain // ipy surf = area[0] / ipx / ipy + area[1] / ipz + area[2] / ipy / ipz - if surf < bestsuf: + if surf < bestsurf: bestsurf = surf n_chares[0] = ipx n_chares[1] = ipy From 29baab2228febe9c1cc3db98bdeb8f71005bd96f Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 27 Jan 2021 12:37:01 -0500 Subject: [PATCH 062/107] add default types for arguments --- examples/cuda/gpudirect/jacobi3d/jacobi3d.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/examples/cuda/gpudirect/jacobi3d/jacobi3d.py b/examples/cuda/gpudirect/jacobi3d/jacobi3d.py index 2efa842e..6e4c4306 100644 --- a/examples/cuda/gpudirect/jacobi3d/jacobi3d.py +++ b/examples/cuda/gpudirect/jacobi3d/jacobi3d.py @@ -23,21 +23,27 @@ def main(args): "CUDA and GPU-Direct communication" ) argp.add_argument('-x', '--grid_width', help = "Grid width", + type = int, default = Defaults.GRID_WIDTH.value ) argp.add_argument('-y', '--grid_height', help = "Grid height", + type = int, default = Defaults.GRID_HEIGHT.value ) argp.add_argument('-z', '--grid_depth', help = "Grid depth", + type = int, default = Defaults.GRID_DEPTH.value ) argp.add_argument('-c', '--num_chares', help = "Number of chares", + type = int, default = Defaults.NUM_CHARES ) argp.add_argument('-i', '--iterations', help = "Number of iterations", + type = int, default = Defaults.NUM_ITERS.value ) argp.add_argument('-w', '--warmup_iterations', help = "Number of warmup iterations", + type = int, default = Defaults.WARMUP_ITERS.value ) argp.add_argument('-d', '--use_zerocopy', action = "store_true", @@ -48,6 +54,8 @@ def main(args): action = "store_true", default = Defaults.PRINT_ELEMENTS.value ) + + # only parse the known args because argparse sees the Charm++ args for some reason args, _ = argp.parse_known_args() grid_width = args.grid_width @@ -160,5 +168,3 @@ def calc_num_chares_per_dim(num_chares, grid_w, grid_h, grid_d): return n_chares charm.start(main, modules = ['block']) -# if __name__ == '__main__': - # main(None) From fbd08ccdafd6f64345f255b3a86797bf7d2ec57d Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 27 Jan 2021 12:37:14 -0500 Subject: [PATCH 063/107] fix python syntax errors --- examples/cuda/gpudirect/jacobi3d/kernels.py | 38 +++++++++------------ 1 file changed, 17 insertions(+), 21 deletions(-) diff --git a/examples/cuda/gpudirect/jacobi3d/kernels.py b/examples/cuda/gpudirect/jacobi3d/kernels.py index bf4d159c..36d64581 100644 --- a/examples/cuda/gpudirect/jacobi3d/kernels.py +++ b/examples/cuda/gpudirect/jacobi3d/kernels.py @@ -82,15 +82,15 @@ def jacobiKernel(temp, new_temp, block_width, block_height, block_depth): j = (blockDim.y*blockIdx.y+threadIdx.y)+1 k = (blockDim.z*blockIdx.z+threadIdx.z)+1 - 
if (i <= block_width && j <= block_height && k <= block_depth): - new_temperature[IDX(i,j,k, block_width, block_height)] = - (temperature[IDX(i,j,k, block_width, block_height)] + - temperature[IDX(i-1,j,k, block_width, block_height)] + - temperature[IDX(i+1,j,k, block_width, block_height)] + - temperature[IDX(i,j-1,k, block_width, block_height)] + - temperature[IDX(i,j+1,k, block_width, block_height)] + - temperature[IDX(i,j,k-1, block_width, block_height)] + - temperature[IDX(i,j,k+1, block_width, block_height)]) * + if (i <= block_width and j <= block_height and k <= block_depth): + new_temperature[IDX(i,j,k, block_width, block_height)] = \ + (temperature[IDX(i,j,k, block_width, block_height)] + \ + temperature[IDX(i-1,j,k, block_width, block_height)] + \ + temperature[IDX(i+1,j,k, block_width, block_height)] + \ + temperature[IDX(i,j-1,k, block_width, block_height)] + \ + temperature[IDX(i,j+1,k, block_width, block_height)] + \ + temperature[IDX(i,j,k-1, block_width, block_height)] + \ + temperature[IDX(i,j,k+1, block_width, block_height)]) * \ 0.142857 # equivalent to dividing by 7 @cuda.jit @@ -98,7 +98,7 @@ def leftPackingKernel(temperature, ghost, block_width, block_height, block_depth j = blockDim.x*blockIdx.x+threadIdx.x; k = blockDim.y*blockIdx.y+threadIdx.y; if j < block_height and k < block_depth: - ghost[block_height*k+j] = + ghost[block_height*k+j] = \ temperature[IDX(1,1+j,1+k, block_width, block_height)] @cuda.jit @@ -106,17 +106,15 @@ def rightPackingKernel(temperature, ghost, block_width, block_height, block_dept j = blockDim.x*blockIdx.x+threadIdx.x k = blockDim.y*blockIdx.y+threadIdx.y if j < block_height and k < block_depth: - ghost[block_height*k+j] = + ghost[block_height*k+j] = \ temperature[IDX(1,1+j,1+k, block_width, block_height)] - } - @cuda.jit def topPackingKernel(temperature, ghost, block_width, block_height, block_depth): i = blockDim.x*blockIdx.x+threadIdx.x k = blockDim.y*blockIdx.y+threadIdx.y if i < block_width and k < block_depth: - ghost[block_width*k+i] = + ghost[block_width*k+i] = \ temperature[IDX(1+i,1,1+k, block_width, block_height)] @cuda.jit @@ -124,16 +122,15 @@ def bottomPackingKernel(temperature, ghost, block_width, block_height, block_dep i = blockDim.x*blockIdx.x+threadIdx.x k = blockDim.y*blockIdx.y+threadIdx.y if i < block_width and k < block_depth: - ghost[block_width*k+i] = + ghost[block_width*k+i] = \ temperature[IDX(1+i,block_height,1+k, block_width, block_height)]; - } @cuda.jit def frontPackingKernel(temperature, ghost, block_width, block_height, block_depth): i = blockDim.x*blockIdx.x+threadIdx.x j = blockDim.y*blockIdx.y+threadIdx.y if i < block_width and j < block_height: - temperature[IDX(1+i,1+j,0, block_width, block_height)] = + temperature[IDX(1+i,1+j,0, block_width, block_height)] = \ ghost[block_width*j+i] @cuda.jit @@ -141,8 +138,8 @@ def backPackingKernel(temperature, ghost, block_width, block_height, block_depth i = blockDim.x*blockIdx.x+threadIdx.x j = blockDim.y*blockIdx.y+threadIdx.y if i < block_width and j < block_height: - temperature[IDX(1+i,1+j,block_depth+1, block_width, block_height)] = - ghost[block_width*j+i] + temperature[IDX(1+i,1+j,block_depth+1, block_width, block_height)] = \ + ghost[block_width*j+i] @cuda.jit @@ -205,11 +202,10 @@ def invokeGhostInitKernels(ghosts, ghost_counts, stream): # be transferred automatically # https://docs.nvidia.com/cuda/cuda-c-programming-guide/#dim3 block_dim = (256, 1, 1) - dim3 block_dim(256); for i in range(len(ghosts)): ghost = ghosts[i] ghost_count = ghost_counts[i] - 
grid_dim = (ghost_count+block_dim[0]-1)//block_dim[0], 1, 1) + grid_dim = ((ghost_count+block_dim[0]-1)//block_dim[0], 1, 1) ghostInitKernel[grid_dim, block_dim, stream](ghosts, ghost_count) From b5d7bff914b5d52563c5323cc9bcc349718bac2f Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 27 Jan 2021 12:37:33 -0500 Subject: [PATCH 064/107] include chare file --- examples/cuda/gpudirect/jacobi3d/block.py | 54 +++++++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 examples/cuda/gpudirect/jacobi3d/block.py diff --git a/examples/cuda/gpudirect/jacobi3d/block.py b/examples/cuda/gpudirect/jacobi3d/block.py new file mode 100644 index 00000000..24839058 --- /dev/null +++ b/examples/cuda/gpudirect/jacobi3d/block.py @@ -0,0 +1,54 @@ +from charm4py import * +import kernels + +class Block(Chare): + def __init__(self, init_done_future): + self.my_iter = 0 + self.neighbors = 0 + self.remote_count = 0 + self.x = self.thisIndex[0] + self.y = self.thisIndex[1] + self.z = self.thisIndex[2] + + self.bounds = [False] * kernels.DIR_COUNT + self.init_bounds(self.x, self.y, self.z) + + self.h_ghosts = [] + self.d_ghosts = [] + self.d_send_ghosts = [] + self.d_recv_ghosts = [] + self.d_ghosts_addr = [] + self.d_send_ghosts_addr = [] + self.d_recv_ghosts_addr = [] + + self.reduce(init_done_future) + + def init_bounds(self, x, y, z): + neighbors = 0 + + if x == 0: + self.bounds[kernels.LEFT] = True + else: + neighbors += 1 + if x == n_chares_x - 1: + self.bounds[kernels.RIGHT] = True + else: + neighbors += 1 + if y == 0: + self.bounds[kernels.TOP] = True + else: + neighbors += 1 + if y == n_chares_y - 1: + self.bounds[kernels.BOTTOM] = True + else: + neighbors += 1 + if z == 0: + self.bounds[kernels.FRONT] = True + else: + neighbors += 1 + if z == n_chares_z - 1: + self.bounds[kernels.BACK] = True + else: + neighbors += 1 + + self.neighbors = neighbors From e4338c824245d074f2e7660cd5c0ad1d913be194 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 27 Jan 2021 14:26:54 -0500 Subject: [PATCH 065/107] fix spelling, corrected tuple access --- examples/cuda/gpudirect/jacobi3d/kernels.py | 181 ++++++++++---------- 1 file changed, 90 insertions(+), 91 deletions(-) diff --git a/examples/cuda/gpudirect/jacobi3d/kernels.py b/examples/cuda/gpudirect/jacobi3d/kernels.py index 36d64581..823e4d44 100644 --- a/examples/cuda/gpudirect/jacobi3d/kernels.py +++ b/examples/cuda/gpudirect/jacobi3d/kernels.py @@ -1,5 +1,4 @@ from numba import cuda -from numba.cuda import blockDim, blockIdx, threadIdx TILE_SIZE_3D = 8 TILE_SIZE_2D = 16 @@ -18,69 +17,69 @@ def IDX(i,j,k, block_width, block_height): @cuda.jit def initKernel(temperature, block_width, block_height, block_depth): - i = blockDim.x * blockIdx.x + threadIdx.x - j = blockDim.y * blockIdx.y + threadIdx.y - k = blockDim.z * blockIdx.z + threadIdx.z + i = cuda.blockDim.x * cuda.blockIdx.x + cuda.threadIdx.x + j = cuda.blockDim.y * cuda.blockIdx.y + cuda.threadIdx.y + k = cuda.blockDim.z * cuda.blockIdx.z + cuda.threadIdx.z if i < block_width + 2 and j < block_height + 2 and k < block_depth + 2: temperature[IDX(i, j, k, block_width, block_height)] = 0 @cuda.jit def ghostInitKernel(ghost, ghost_count): - i = blockDim.x * blockIdx.x + threadIdx.x + i = cuda.blockDim.x * cuda.blockIdx.x + cuda.threadIdx.x if i < ghost_count: ghost[i] = 0 @cuda.jit def leftBoundaryKernel(temperature, block_width, block_height, block_depth): - j = blockDim.x * blockIdx.x + threadIdx.x - k = blockDim.y * blockIdx.y + threadIdx.y + j = cuda.blockDim.x * cuda.blockIdx.x + 
cuda.threadIdx.x + k = cuda.blockDim.y * cuda.blockIdx.y + cuda.threadIdx.y if j < block_height and k < block_depth: temperature[IDX(0,1+j,1+k, block_width, block_height)] = 1; @cuda.jit def rightBoundaryKernel(temperature, block_width, block_height, block_depth): - j = blockDim.x*blockIdx.x+threadIdx.x - k = blockDim.y*blockIdx.y+threadIdx.y + j = cuda.blockDim.x*cuda.blockIdx.x+cuda.threadIdx.x + k = cuda.blockDim.y*cuda.blockIdx.y+cuda.threadIdx.y if j < block_height and k < block_depth: temperature[IDX(block_width+1,1+j,1+k, block_width, block_height)] = 1; @cuda.jit def topBoundaryKernel(temperature, block_width, block_height, block_depth): - i = blockDim.x*blockIdx.x+threadIdx.x - k = blockDim.y*blockIdx.y+threadIdx.y + i = cuda.blockDim.x*cuda.blockIdx.x+cuda.threadIdx.x + k = cuda.blockDim.y*cuda.blockIdx.y+cuda.threadIdx.y if i < block_width and k < block_depth: temperature[IDX(1+i,0,1+k, block_width, block_height)] = 1 @cuda.jit def bottomBoundaryKernel(temperature, block_width, block_height, block_depth): - i = blockDim.x*blockIdx.x+threadIdx.x - k = blockDim.y*blockIdx.y+threadIdx.y + i = cuda.blockDim.x*cuda.blockIdx.x+cuda.threadIdx.x + k = cuda.blockDim.y*cuda.blockIdx.y+cuda.threadIdx.y if i < block_width and k < block_depth: temperature[IDX(1+i,block_height+1,1+k, block_width, block_height)] = 1 @cuda.jit def frontBoundaryKernel(temperature, block_width, block_height, block_depth): - i = blockDim.x*blockIdx.x+threadIdx.x - j = blockDim.y*blockIdx.y+threadIdx.y + i = cuda.blockDim.x*cuda.blockIdx.x+cuda.threadIdx.x + j = cuda.blockDim.y*cuda.blockIdx.y+cuda.threadIdx.y if i < block_width and j < block_height: temperature[IDX(1+i,1+j,0, block_width, block_height)] = 1; @cuda.jit def backBoundaryKernel(temperature, block_width, block_height, block_depth): - i = blockDim.x*blockIdx.x+threadIdx.x - j = blockDim.y*blockIdx.y+threadIdx.y + i = cuda.blockDim.x*cuda.blockIdx.x+cuda.threadIdx.x + j = cuda.blockDim.y*cuda.blockIdx.y+cuda.threadIdx.y if i < block_width and j < block_height: temperature[IDX(1+i,1+j,block_depth+1, block_width, block_height)] = 1 @cuda.jit def jacobiKernel(temp, new_temp, block_width, block_height, block_depth): - i = (blockDim.x*blockIdx.x+threadIdx.x)+1 - j = (blockDim.y*blockIdx.y+threadIdx.y)+1 - k = (blockDim.z*blockIdx.z+threadIdx.z)+1 + i = (cuda.blockDim.x*cuda.blockIdx.x+cuda.threadIdx.x)+1 + j = (cuda.blockDim.y*cuda.blockIdx.y+cuda.threadIdx.y)+1 + k = (cuda.blockDim.z*cuda.blockIdx.z+cuda.threadIdx.z)+1 if (i <= block_width and j <= block_height and k <= block_depth): new_temperature[IDX(i,j,k, block_width, block_height)] = \ @@ -95,48 +94,48 @@ def jacobiKernel(temp, new_temp, block_width, block_height, block_depth): @cuda.jit def leftPackingKernel(temperature, ghost, block_width, block_height, block_depth): - j = blockDim.x*blockIdx.x+threadIdx.x; - k = blockDim.y*blockIdx.y+threadIdx.y; + j = cuda.blockDim.x*cuda.blockIdx.x+cuda.threadIdx.x; + k = cuda.blockDim.y*cuda.blockIdx.y+cuda.threadIdx.y; if j < block_height and k < block_depth: ghost[block_height*k+j] = \ temperature[IDX(1,1+j,1+k, block_width, block_height)] @cuda.jit def rightPackingKernel(temperature, ghost, block_width, block_height, block_depth): - j = blockDim.x*blockIdx.x+threadIdx.x - k = blockDim.y*blockIdx.y+threadIdx.y + j = cuda.blockDim.x*cuda.blockIdx.x+cuda.threadIdx.x + k = cuda.blockDim.y*cuda.blockIdx.y+cuda.threadIdx.y if j < block_height and k < block_depth: ghost[block_height*k+j] = \ temperature[IDX(1,1+j,1+k, block_width, block_height)] @cuda.jit def 
topPackingKernel(temperature, ghost, block_width, block_height, block_depth): - i = blockDim.x*blockIdx.x+threadIdx.x - k = blockDim.y*blockIdx.y+threadIdx.y + i = cuda.blockDim.x*cuda.blockIdx.x+cuda.threadIdx.x + k = cuda.blockDim.y*cuda.blockIdx.y+cuda.threadIdx.y if i < block_width and k < block_depth: ghost[block_width*k+i] = \ temperature[IDX(1+i,1,1+k, block_width, block_height)] @cuda.jit def bottomPackingKernel(temperature, ghost, block_width, block_height, block_depth): - i = blockDim.x*blockIdx.x+threadIdx.x - k = blockDim.y*blockIdx.y+threadIdx.y + i = cuda.blockDim.x*cuda.blockIdx.x+cuda.threadIdx.x + k = cuda.blockDim.y*cuda.blockIdx.y+cuda.threadIdx.y if i < block_width and k < block_depth: ghost[block_width*k+i] = \ temperature[IDX(1+i,block_height,1+k, block_width, block_height)]; @cuda.jit def frontPackingKernel(temperature, ghost, block_width, block_height, block_depth): - i = blockDim.x*blockIdx.x+threadIdx.x - j = blockDim.y*blockIdx.y+threadIdx.y + i = cuda.blockDim.x*cuda.blockIdx.x+cuda.threadIdx.x + j = cuda.blockDim.y*cuda.blockIdx.y+cuda.threadIdx.y if i < block_width and j < block_height: temperature[IDX(1+i,1+j,0, block_width, block_height)] = \ ghost[block_width*j+i] @cuda.jit def backPackingKernel(temperature, ghost, block_width, block_height, block_depth): - i = blockDim.x*blockIdx.x+threadIdx.x - j = blockDim.y*blockIdx.y+threadIdx.y + i = cuda.blockDim.x*cuda.blockIdx.x+cuda.threadIdx.x + j = cuda.blockDim.y*cuda.blockIdx.y+cuda.threadIdx.y if i < block_width and j < block_height: temperature[IDX(1+i,1+j,block_depth+1, block_width, block_height)] = \ ghost[block_width*j+i] @@ -144,8 +143,8 @@ def backPackingKernel(temperature, ghost, block_width, block_height, block_depth @cuda.jit def leftUnpackingKernel(temperature, ghost, block_width, block_height, block_depth): - j = blockDim.x*blockIdx.x+threadIdx.x - k = blockDim.y*blockIdx.y+threadIdx.y + j = cuda.blockDim.x*cuda.blockIdx.x+cuda.threadIdx.x + k = cuda.blockDim.y*cuda.blockIdx.y+cuda.threadIdx.y if j < block_height and k < block_depth: temperature[IDX(0,1+j,1+k, block_width, block_height)] = ghost[block_height*k+j] @@ -153,36 +152,36 @@ def leftUnpackingKernel(temperature, ghost, block_width, block_height, block_dep @cuda.jit def rightUnpackingKernel(temperature, ghost, block_width, block_height, block_depth): - j = blockDim.x*blockIdx.x+threadIdx.x - k = blockDim.y*blockIdx.y+threadIdx.y + j = cuda.blockDim.x*cuda.blockIdx.x+cuda.threadIdx.x + k = cuda.blockDim.y*cuda.blockIdx.y+cuda.threadIdx.y if j < block_height and k < block_depth: temperature[IDX(block_width+1,1+j,1+k, block_width, block_height)] = ghost[block_height*k+j] @cuda.jit def topUnpackingKernel(temperature, ghost, block_width, block_height, block_depth): - i = blockDim.x*blockIdx.x+threadIdx.x - k = blockDim.y*blockIdx.y+threadIdx.y + i = cuda.blockDim.x*cuda.blockIdx.x+cuda.threadIdx.x + k = cuda.blockDim.y*cuda.blockIdx.y+cuda.threadIdx.y if i < block_width and k < block_depth: temperature[IDX(1+i,0,1+k, block_width, block_height)] = ghost[block_width*k+i] @cuda.jit def bottomUnpackingKernel(temperature, ghost, block_width, block_height, block_depth): - i = blockDim.x*blockIdx.x+threadIdx.x - k = blockDim.y*blockIdx.y+threadIdx.y + i = cuda.blockDim.x*cuda.blockIdx.x+cuda.threadIdx.x + k = cuda.blockDim.y*cuda.blockIdx.y+cuda.threadIdx.y if i < block_width and k < block_depth: temperature[IDX(1+i,block_height+1,1+k, block_width, block_height)] = ghost[block_width*k+i] @cuda.jit def frontUnpackingKernel(temperature, ghost, 
block_width, block_height, block_depth): - i = blockDim.x*blockIdx.x+threadIdx.x - j = blockDim.y*blockIdx.y+threadIdx.y + i = cuda.blockDim.x*cuda.blockIdx.x+cuda.threadIdx.x + j = cuda.blockDim.y*cuda.blockIdx.y+cuda.threadIdx.y if i < block_width and j < block_height: temperature[IDX(1+i,1+j,0, block_width, block_height)] = ghost[block_width*j+i] @cuda.jit def backUnpackingKernel(temperature, ghost, block_width, block_height, block_depth): - i = blockDim.x*blockIdx.x+threadIdx.x - j = blockDim.y*blockIdx.y+threadIdx.y + i = cuda.blockDim.x*cuda.blockIdx.x+cuda.threadIdx.x + j = cuda.blockDim.y*cuda.blockIdx.y+cuda.threadIdx.y if i < block_width and j < block_height: temperature[IDX(1+i,1+j,block_depth+1, block_width, block_height)] = ghost[block_width*j+i] @@ -207,58 +206,58 @@ def invokeGhostInitKernels(ghosts, ghost_counts, stream): ghost_count = ghost_counts[i] grid_dim = ((ghost_count+block_dim[0]-1)//block_dim[0], 1, 1) - ghostInitKernel[grid_dim, block_dim, stream](ghosts, ghost_count) + ghostInitKernel[grid_dim, block_dim, stream](ghost, ghost_count) def invokeBoundaryKernels(d_temperature, block_width, block_height, block_depth, bounds, stream): block_dim = (TILE_SIZE_2D, TILE_SIZE_2D, 1) - if bounds(LEFT): - grid_dim = ((block_height+(block_dim.x-1))//block_dim.x, - (block_depth+(block_dim.y-1))//block_dim.y, 1) + if bounds[LEFT]: + grid_dim = ((block_height+(block_dim[0]-1))//block_dim[0], + (block_depth+(block_dim[1]-1))//block_dim[1], 1) leftBoundaryKernel[grid_dim, block_dim, stream](d_temperature, block_width, block_height, block_depth ) - if bounds(RIGHT): - grid_dim = ((block_height+(block_dim.x-1))//block_dim.x, - (block_depth+(block_dim.y-1))//block_dim.y, 1) + if bounds[RIGHT]: + grid_dim = ((block_height+(block_dim[0]-1))//block_dim[0], + (block_depth+(block_dim[1]-1))//block_dim[1], 1) rightBoundaryKernel[grid_dim, block_dim, stream](d_temperature, block_width, block_height, block_depth ) - if bounds(TOP): - grid_dim = ((block_width+(block_dim.x-1))//block_dim.x, - (block_depth+(block_dim.y-1))//block_dim.y, 1) + if bounds[TOP]: + grid_dim = ((block_width+(block_dim[0]-1))//block_dim[0], + (block_depth+(block_dim[1]-1))//block_dim[1], 1) topBoundaryKernel[grid_dim, block_dim, stream](d_temperature, block_width, block_height, block_depth ) - if bounds(BOTTOM): - grid_dim = ((block_width+(block_dim.x-1))//block_dim.x, - (block_depth+(block_dim.y-1))//block_dim.y, 1) + if bounds[BOTTOM]: + grid_dim = ((block_width+(block_dim[0]-1))//block_dim[0], + (block_depth+(block_dim[1]-1))//block_dim[1], 1) bottomBoundaryKernel[grid_dim, block_dim, stream](d_temperature, block_width, block_height, block_depth ) - if bounds(FRONT): - grid_dim = ((block_width+(block_dim.x-1))//block_dim.x, - (block_height+(block_dim.y-1))//block_dim.y, 1) + if bounds[FRONT]: + grid_dim = ((block_width+(block_dim[0]-1))//block_dim[0], + (block_height+(block_dim[1]-1))//block_dim[1], 1) frontBoundaryKernel[grid_dim, block_dim, stream](d_temperature, block_width, block_height, block_depth ) - if bounds(BACK): - grid_dim = ((block_width+(block_dim.x-1))//block_dim.x, - (block_height+(block_dim.y-1))//block_dim.y, 1) + if bounds[BACK]: + grid_dim = ((block_width+(block_dim[0]-1))//block_dim[0], + (block_height+(block_dim[1]-1))//block_dim[1], 1) backBoundaryKernel[grid_dim, block_dim, stream](d_temperature, block_width, block_height, @@ -268,9 +267,9 @@ def invokeBoundaryKernels(d_temperature, block_width, block_height, block_depth, def invokeJacobiKernel(d_temperature, d_new_temperature, block_width, 
block_height, block_depth, stream): block_dim = (TILE_SIZE_3D, TILE_SIZE_3D, TILE_SIZE_3D) - grid_dim = ((block_width+(block_dim.x-1))//block_dim.x, - (block_height+(block_dim.y-1))//block_dim.y, - (block_depth+(block_dim.z-1))//block_dim.z) + grid_dim = ((block_width+(block_dim[0]-1))//block_dim[0], + (block_height+(block_dim[1]-1))//block_dim[1], + (block_depth+(block_dim[2]-1))//block_dim[2]) jacobiKernel[grid_dim, block_dim, stream](d_temperature, d_new_temperature, @@ -280,12 +279,12 @@ def invokeJacobiKernel(d_temperature, d_new_temperature, block_width, block_heig ) -def inbokePackingKernel(d_temperature, d_ghost, dir, block_width, block_height, block_depth, stream): +def invokePackingKernel(d_temperature, d_ghost, dir, block_width, block_height, block_depth, stream): block_dim = (TILE_SIZE_2D, TILE_SIZE_2D, 1) if dir == LEFT: - grid_dim = ((block_height+(block_dim.x-1))//block_dim.x, - (block_depth+(block_dim.y-1))//block_dim.y, 1) + grid_dim = ((block_height+(block_dim[0]-1))//block_dim[0], + (block_depth+(block_dim[1]-1))//block_dim[1], 1) leftPackingKernel[grid_dim, block_dim, stream](d_temperature, d_ghost, block_width, @@ -293,8 +292,8 @@ def inbokePackingKernel(d_temperature, d_ghost, dir, block_width, block_height, block_depth ) elif dir == RIGHT: - grid_dim = ((block_height+(block_dim.x-1))//block_dim.x, - (block_depth+(block_dim.y-1))//block_dim.y, 1) + grid_dim = ((block_height+(block_dim[0]-1))//block_dim[0], + (block_depth+(block_dim[1]-1))//block_dim[1], 1) rightPackingKernel[grid_dim, block_dim, stream](d_temperature, d_ghost, block_width, @@ -302,8 +301,8 @@ def inbokePackingKernel(d_temperature, d_ghost, dir, block_width, block_height, block_depth ) elif dir == TOP: - grid_dim = ((block_width+(block_dim.x-1))//block_dim.x, - (block_depth+(block_dim.y-1))//block_dim.y, 1) + grid_dim = ((block_width+(block_dim[0]-1))//block_dim[0], + (block_depth+(block_dim[1]-1))//block_dim[1], 1) topPackingKernel[grid_dim, block_dim, stream](d_temperature, d_ghost, block_width, @@ -311,8 +310,8 @@ def inbokePackingKernel(d_temperature, d_ghost, dir, block_width, block_height, block_depth ) elif dir == BOTTOM: - grid_dim = ((block_width+(block_dim.x-1))//block_dim.x, - (block_depth+(block_dim.y-1))//block_dim.y, 1) + grid_dim = ((block_width+(block_dim[0]-1))//block_dim[0], + (block_depth+(block_dim[1]-1))//block_dim[1], 1) bottomPackingKernel[grid_dim, block_dim, stream](d_temperature, d_ghost, block_width, @@ -320,8 +319,8 @@ def inbokePackingKernel(d_temperature, d_ghost, dir, block_width, block_height, block_depth ) elif dir == FRONT: - grid_dim = ((block_width+(block_dim.x-1))//block_dim.x, - (block_height+(block_dim.y-1))//block_dim.y, 1) + grid_dim = ((block_width+(block_dim[0]-1))//block_dim[0], + (block_height+(block_dim[1]-1))//block_dim[1], 1) frontPackingKernel[grid_dim, block_dim, stream](d_temperature, d_ghost, block_width, @@ -329,8 +328,8 @@ def inbokePackingKernel(d_temperature, d_ghost, dir, block_width, block_height, block_depth ) elif dir == BACK: - grid_dim = ((block_width+(block_dim.x-1))//block_dim.x, - (block_height+(block_dim.y-1))//block_dim.y, 1) + grid_dim = ((block_width+(block_dim[0]-1))//block_dim[0], + (block_height+(block_dim[1]-1))//block_dim[1], 1) backPackingKernel[grid_dim, block_dim, stream](d_temperature, d_ghost, block_width, @@ -342,8 +341,8 @@ def invokeUnpackingKernel(d_temperature, d_ghost, dir, block_width, block_height block_dim = (TILE_SIZE_2D, TILE_SIZE_2D, 1) if dir == LEFT: - grid_dim = ((block_height+(block_dim.x-1))//block_dim.x, - 
(block_depth+(block_dim.y-1))//block_dim.y, 1) + grid_dim = ((block_height+(block_dim[0]-1))//block_dim[0], + (block_depth+(block_dim[1]-1))//block_dim[1], 1) leftUnpackingKernel[grid_dim, block_dim, stream](d_temperature, d_ghost, block_width, @@ -351,8 +350,8 @@ def invokeUnpackingKernel(d_temperature, d_ghost, dir, block_width, block_height block_depth ) if dir == RIGHT: - grid_dim = ((block_height+(block_dim.x-1))//block_dim.x, - (block_depth+(block_dim.y-1))//block_dim.y, 1) + grid_dim = ((block_height+(block_dim[0]-1))//block_dim[0], + (block_depth+(block_dim[1]-1))//block_dim[1], 1) rightUnpackingKernel[grid_dim, block_dim, stream](d_temperature, d_ghost, block_width, @@ -360,8 +359,8 @@ def invokeUnpackingKernel(d_temperature, d_ghost, dir, block_width, block_height block_depth ) if dir == TOP: - grid_dim = ((block_width+(block_dim.x-1))//block_dim.x, - (block_depth+(block_dim.y-1))//block_dim.y, 1) + grid_dim = ((block_width+(block_dim[0]-1))//block_dim[0], + (block_depth+(block_dim[1]-1))//block_dim[1], 1) topUnpackingKernel[grid_dim, block_dim, stream](d_temperature, d_ghost, block_width, @@ -369,8 +368,8 @@ def invokeUnpackingKernel(d_temperature, d_ghost, dir, block_width, block_height block_depth ) if dir == BOTTOM: - grid_dim = ((block_width+(block_dim.x-1))//block_dim.x, - (block_depth+(block_dim.y-1))//block_dim.y, 1) + grid_dim = ((block_width+(block_dim[0]-1))//block_dim[0], + (block_depth+(block_dim[1]-1))//block_dim[1], 1) bottomUnpackingKernel[grid_dim, block_dim, stream](d_temperature, d_ghost, block_width, @@ -378,8 +377,8 @@ def invokeUnpackingKernel(d_temperature, d_ghost, dir, block_width, block_height block_depth ) if dir == FRONT: - grid_dim = ((block_width+(block_dim.x-1))//block_dim.x, - (block_height+(block_dim.y-1))//block_dim.y, 1) + grid_dim = ((block_width+(block_dim[0]-1))//block_dim[0], + (block_height+(block_dim[1]-1))//block_dim[1], 1) frontUnpackingKernel[grid_dim, block_dim, stream](d_temperature, d_ghost, block_width, @@ -387,8 +386,8 @@ def invokeUnpackingKernel(d_temperature, d_ghost, dir, block_width, block_height block_depth ) if dir == BACK: - grid_dim = ((block_width+(block_dim.x-1))//block_dim.x, - (block_height+(block_dim.y-1))//block_dim.y, 1) + grid_dim = ((block_width+(block_dim[0]-1))//block_dim[0], + (block_height+(block_dim[1]-1))//block_dim[1], 1) backUnpackingKernel[grid_dim, block_dim, stream](d_temperature, d_ghost, block_width, From 6e076606a8f74b0d9f2eabb71b76347e129515f7 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 27 Jan 2021 14:27:30 -0500 Subject: [PATCH 066/107] add chare initialization --- examples/cuda/gpudirect/jacobi3d/block.py | 129 ++++++++++++++++++++-- 1 file changed, 121 insertions(+), 8 deletions(-) diff --git a/examples/cuda/gpudirect/jacobi3d/block.py b/examples/cuda/gpudirect/jacobi3d/block.py index 24839058..476537ae 100644 --- a/examples/cuda/gpudirect/jacobi3d/block.py +++ b/examples/cuda/gpudirect/jacobi3d/block.py @@ -1,6 +1,18 @@ from charm4py import * +import array +from numba import cuda +import numpy as np import kernels +def getArrayAddress(arr): + return arr.__cuda__array_interface__['data'][0] + +def getArraySize(arr): + return arr.nbytes + +def getArrayData(arr): + return (getArrayAddress(arr), getArraySize(arr)) + class Block(Chare): def __init__(self, init_done_future): self.my_iter = 0 @@ -9,20 +21,121 @@ def __init__(self, init_done_future): self.x = self.thisIndex[0] self.y = self.thisIndex[1] self.z = self.thisIndex[2] + self.ghost_sizes = (x_surf_size, x_surf_size, + y_surf_size, 
y_surf_size, + z_surf_size, z_surf_size + ) + + self.ghost_counts = (x_surf_count, x_surf_count, + y_surf_count, y_surf_count, + z_surf_count, z_surf_count + ) self.bounds = [False] * kernels.DIR_COUNT - self.init_bounds(self.x, self.y, self.z) - self.h_ghosts = [] - self.d_ghosts = [] - self.d_send_ghosts = [] - self.d_recv_ghosts = [] - self.d_ghosts_addr = [] - self.d_send_ghosts_addr = [] - self.d_recv_ghosts_addr = [] + empty = lambda x: [0] * x + + self.h_temperature = None + self.d_temperature = None + self.d_new_temperature = None + self.h_ghosts = empty(kernels.DIR_COUNT) + self.d_ghosts = empty(kernels.DIR_COUNT) + self.d_send_ghosts = empty(kernels.DIR_COUNT) + self.d_recv_ghosts = empty(kernels.DIR_COUNT) + self.d_ghosts_addr = empty(kernels.DIR_COUNT) + self.d_send_ghosts_addr = empty(kernels.DIR_COUNT) + self.d_recv_ghosts_addr = empty(kernels.DIR_COUNT) + self.d_send_ghosts_size = empty(kernels.DIR_COUNT) + self.d_recv_ghotss_size = empty(kernels.DIR_COUNT) + + self.stream = cuda.default_stream() + + self.init() self.reduce(init_done_future) + def init(self): + self.init_bounds(self.x, self.y, self.z) + self.init_device_data() + + def init_device_data(self): + temp_size = (block_width+2) * (block_height+2) * (block_depth+2) + self.h_temperature = cuda.pinned_array(temp_size, dtype=np.float64) + self.d_temperature = cuda.device_array(temp_size, dtype=np.float64) + self.d_new_temperature = cuda.device_array(temp_size, dtype=np.float64) + + if use_zerocopy: + for i in range(kernels.DIR_COUNT): + self.d_send_ghosts[i] = cuda.device_array(self.ghost_sizes[i], + dtype=np.float64 + ) + self.d_recv_ghosts[i] = cuda.device_array(self.ghost_sizes[i], + dtype=np.float64 + ) + + d_send_data = getArrayData(d_send_ghosts) + d_recv_data = getArrayData(d_send_ghosts) + + d_send_addr = array.array('L', [d_send_data[0]]) + d_recv_addr = array.array('L', [d_recv_data[0]]) + + d_send_size = array.array('L', [d_send_data[1]]) + d_recv_size = array.array('L', [d_recv_data[1]]) + + self.d_send_ghosts_addr[i] = d_send_addr + self.d_recv_ghosts_addr[i] = d_recv_addr + + self.d_send_ghosts_size[i] = d_send_size + self.d_recv_ghosts_size[i] = d_recv_size + else: + for i in range(kernels.DIR_COUNT): + self.h_ghosts[i] = cuda.pinned_array(self.ghost_sizes[i], + dtype=np.float64 + ) + self.d_ghosts[i] = cuda.device_array(self.ghost_sizes[i], + dtype=np.float64 + ) + + kernels.invokeInitKernel(self.d_temperature, block_width, block_height, block_depth, + self.stream + ) + kernels.invokeInitKernel(self.d_new_temperature, block_width, block_height, block_depth, + self.stream + ) + if use_zerocopy: + kernels.invokeGhostInitKernels(self.d_send_ghosts, + self.ghost_counts, + self.stream + ) + kernels.invokeGhostInitKernels(self.d_recv_ghosts, + self.ghost_counts, + self.stream + ) + else: + kernels.invokeGhostInitKernels(self.d_ghosts, + self.ghost_counts, + self.stream + ) + for i in range(kernels.DIR_COUNT): + self.h_ghosts[i].fill(0) + + kernels.invokeBoundaryKernels(self.d_temperature, + block_width, + block_height, + block_depth, + self.bounds, + self.stream + ) + kernels.invokeBoundaryKernels(self.d_new_temperature, + block_width, + block_height, + block_depth, + self.bounds, + self.stream + ) + self.stream.synchronize() + + def init_bounds(self, x, y, z): neighbors = 0 From b2d9c6e99c70d7bf6b6700117c80d58a7efb06c9 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 27 Jan 2021 14:40:19 -0500 Subject: [PATCH 067/107] add neighbor channel initialization --- examples/cuda/gpudirect/jacobi3d/block.py | 31 
+++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/examples/cuda/gpudirect/jacobi3d/block.py b/examples/cuda/gpudirect/jacobi3d/block.py index 476537ae..86c3d359 100644 --- a/examples/cuda/gpudirect/jacobi3d/block.py +++ b/examples/cuda/gpudirect/jacobi3d/block.py @@ -21,6 +21,7 @@ def __init__(self, init_done_future): self.x = self.thisIndex[0] self.y = self.thisIndex[1] self.z = self.thisIndex[2] + self.ghost_sizes = (x_surf_size, x_surf_size, y_surf_size, y_surf_size, z_surf_size, z_surf_size @@ -35,6 +36,8 @@ def __init__(self, init_done_future): empty = lambda x: [0] * x + self.neighbor_channels = empty(kernels.DIR_COUNT) + self.h_temperature = None self.d_temperature = None self.d_new_temperature = None @@ -57,6 +60,34 @@ def __init__(self, init_done_future): def init(self): self.init_bounds(self.x, self.y, self.z) self.init_device_data() + self.init_neighbor_channels() + + def init_neighbor_channels(self): + n_channels = self.neighbors + + if not self.bounds[kernels.LEFT]: + new_c = Channel(self, self.thisProxy[(self.x-1, self.y, self.z)]) + self.neighbor_channels[kernels.LEFT] = new_c + + if not self.bounds[kernels.RIGHT]: + new_c = Channel(self, self.thisProxy[(self.x+1, self.y, self.z)]) + self.neighbor_channels[kernels.RIGHT] = new_c + + if not self.bounds[kernels.TOP]: + new_c = Channel(self, self.thisProxy[(self.x, self.y-1, self.z)]) + self.neighbor_channels[kernels.TOP] = new_c + + if not self.bounds[kernels.BOTTOM]: + new_c = Channel(self, self.thisProxy[(self.x, self.y+1, self.z)]) + self.neighbor_channels[kernels.BOTTOM] = new_c + + if not self.bounds[kernels.FRONT]: + new_c = Channel(self, self.thisProxy[(self.x, self.y, self.z-1)]) + self.neighbor_channels[kernels.FRONT] = new_c + + if not self.bounds[kernels.BACK]: + new_c = Channel(self, self.thisProxy[(self.x, self.y, self.z+1)]) + self.neighbor_channels[kernels.BACK] = new_c def init_device_data(self): temp_size = (block_width+2) * (block_height+2) * (block_depth+2) From 457a9fa249bfc7378366e2b7a710ebfeab79f0ec Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 27 Jan 2021 16:14:23 -0500 Subject: [PATCH 068/107] finish impl --- examples/cuda/gpudirect/jacobi3d/block.py | 131 +++++++++++++++++++++- 1 file changed, 127 insertions(+), 4 deletions(-) diff --git a/examples/cuda/gpudirect/jacobi3d/block.py b/examples/cuda/gpudirect/jacobi3d/block.py index 86c3d359..9a87e350 100644 --- a/examples/cuda/gpudirect/jacobi3d/block.py +++ b/examples/cuda/gpudirect/jacobi3d/block.py @@ -2,10 +2,11 @@ import array from numba import cuda import numpy as np +import time import kernels def getArrayAddress(arr): - return arr.__cuda__array_interface__['data'][0] + return arr.__cuda_array_interface__['data'][0] def getArraySize(arr): return arr.nbytes @@ -37,6 +38,7 @@ def __init__(self, init_done_future): empty = lambda x: [0] * x self.neighbor_channels = empty(kernels.DIR_COUNT) + self.acive_neighbor_channels = None self.h_temperature = None self.d_temperature = None @@ -49,7 +51,7 @@ def __init__(self, init_done_future): self.d_send_ghosts_addr = empty(kernels.DIR_COUNT) self.d_recv_ghosts_addr = empty(kernels.DIR_COUNT) self.d_send_ghosts_size = empty(kernels.DIR_COUNT) - self.d_recv_ghotss_size = empty(kernels.DIR_COUNT) + self.d_recv_ghosts_size = empty(kernels.DIR_COUNT) self.stream = cuda.default_stream() @@ -64,30 +66,46 @@ def init(self): def init_neighbor_channels(self): n_channels = self.neighbors + active_neighbors = [] if not self.bounds[kernels.LEFT]: new_c = Channel(self, self.thisProxy[(self.x-1, 
self.y, self.z)]) self.neighbor_channels[kernels.LEFT] = new_c + # NOTE: we are adding the member 'recv_direction' to this channel!!! + new_c.recv_direction = kernels.LEFT + active_neighbors.append(new_c) if not self.bounds[kernels.RIGHT]: new_c = Channel(self, self.thisProxy[(self.x+1, self.y, self.z)]) self.neighbor_channels[kernels.RIGHT] = new_c + new_c.recv_direction = kernels.RIGHT + active_neighbors.append(new_c) if not self.bounds[kernels.TOP]: new_c = Channel(self, self.thisProxy[(self.x, self.y-1, self.z)]) self.neighbor_channels[kernels.TOP] = new_c + new_c.recv_direction = kernels.TOP + active_neighbors.append(new_c) if not self.bounds[kernels.BOTTOM]: new_c = Channel(self, self.thisProxy[(self.x, self.y+1, self.z)]) self.neighbor_channels[kernels.BOTTOM] = new_c + new_c.recv_direction = kernels.BOTTOM + active_neighbors.append(new_c) if not self.bounds[kernels.FRONT]: new_c = Channel(self, self.thisProxy[(self.x, self.y, self.z-1)]) self.neighbor_channels[kernels.FRONT] = new_c + new_c.recv_direction = kernels.FRONT + active_neighbors.append(new_c) if not self.bounds[kernels.BACK]: new_c = Channel(self, self.thisProxy[(self.x, self.y, self.z+1)]) self.neighbor_channels[kernels.BACK] = new_c + new_c.recv_direction = kernels.BACK + active_neighbors.append(new_c) + + self.active_neighbor_channels = active_neighbors def init_device_data(self): temp_size = (block_width+2) * (block_height+2) * (block_depth+2) @@ -104,8 +122,8 @@ def init_device_data(self): dtype=np.float64 ) - d_send_data = getArrayData(d_send_ghosts) - d_recv_data = getArrayData(d_send_ghosts) + d_send_data = getArrayData(self.d_send_ghosts[i]) + d_recv_data = getArrayData(self.d_send_ghosts[i]) d_send_addr = array.array('L', [d_send_data[0]]) d_recv_addr = array.array('L', [d_recv_data[0]]) @@ -196,3 +214,108 @@ def init_bounds(self, x, y, z): neighbors += 1 self.neighbors = neighbors + + + @coro + def sendGhosts(self): + for dir in range(kernels.DIR_COUNT): + if not self.bounds[dir]: + self.sendGhost(dir) + + def updateAndPack(self): + kernels.invokeJacobiKernel(self.d_temperature, + self.d_new_temperature, + block_width, + block_height, + block_depth, + self.stream + ) + + for i in range(kernels.DIR_COUNT): + if not self.bounds[i]: + ghosts = self.d_send_ghosts[i] if use_zerocopy else self.d_ghosts[i] + + kernels.invokePackingKernel(self.d_temperature, + ghosts, + i, + block_width, + block_height, + block_depth, + self.stream + ) + if not use_zerocopy: + # TODO: change this to the CUDA hooks in charmlib + self.d_ghosts[i].copy_to_host(self.h_ghosts[i]) + self.stream.synchronize() + + + @coro + def sendGhost(self, direction): + send_ch = self.neighbor_channels[direction] + + if use_zerocopy: + send_ch.send(gpu_src_ptrs = self.d_send_ghosts_addr[direction], + gpu_src_sizes = self.d_send_ghosts_size[direction] + ) + else: + send_ch.send(self.h_ghosts[direction]) + + @coro + def recvGhosts(self): + for ch in charm.iwait(self.active_neighbor_channels): + # remember: we set 'recv_direction' member + # directly in the initialization phase + neighbor_idx = ch.recv_direction + + if use_zerocopy: + ch.recv(post_buf_addresses = self.d_recv_ghosts_addr[neighbor_idx], + post_buf_sizes = self.d_recv_ghosts_size[neighbor_idx] + ) + else: + self.h_ghosts[neighbor_idx] = ch.recv() + self.d_ghosts[neighbor_idx].copy_to_device(self.h_ghosts[neighbor_idx], + stream=self.stream + ) + + kernels.invokeUnpackingKernel(self.d_temperature, + self.d_ghosts[neighbor_idx], + ch.recv_direction, + block_width, + block_height, + block_depth, + 
self.stream + ) + self.stream.synchronize() + + @coro + def exchangeGhosts(self): + self.d_temperature, self.d_new_temperature = \ + self.d_new_temperature, self.d_temperature + + self.sendGhosts() + self.recvGhosts() + + @coro + def run(self, done_future): + tstart = time.time() + comm_time = 0 + for current_iter in range(n_iters + warmup_iters): + if current_iter == warmup_iters: + tstart = time.time() + + self.my_iter = current_iter + self.updateAndPack() + + comm_start_time = time.time() + + self.exchangeGhosts() + + if current_iter >= warmup_iters: + comm_time += time.time() - comm_start_time + + + tend = time.time() + + if self.thisIndex == (0, 0, 0): + print(f'Elapsed time: {tend-tstart}') + self.reduce(done_future) From a65c2dd73f0be23aa733da34a71e1f0af10c453e Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 27 Jan 2021 16:14:35 -0500 Subject: [PATCH 069/107] chares now run --- examples/cuda/gpudirect/jacobi3d/jacobi3d.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/examples/cuda/gpudirect/jacobi3d/jacobi3d.py b/examples/cuda/gpudirect/jacobi3d/jacobi3d.py index 6e4c4306..2160ed7e 100644 --- a/examples/cuda/gpudirect/jacobi3d/jacobi3d.py +++ b/examples/cuda/gpudirect/jacobi3d/jacobi3d.py @@ -133,6 +133,10 @@ def main(args): args = [init_done_future] ) init_done_future.get() + + run_future = Future() + block_proxy.run(run_future) + run_future.get() charm.exit() From e5c6aa09695da53e3c6340ce92fb9d9ca9c68486 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 27 Jan 2021 16:14:49 -0500 Subject: [PATCH 070/107] fix mispelled var --- examples/cuda/gpudirect/jacobi3d/kernels.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cuda/gpudirect/jacobi3d/kernels.py b/examples/cuda/gpudirect/jacobi3d/kernels.py index 823e4d44..ce489757 100644 --- a/examples/cuda/gpudirect/jacobi3d/kernels.py +++ b/examples/cuda/gpudirect/jacobi3d/kernels.py @@ -76,7 +76,7 @@ def backBoundaryKernel(temperature, block_width, block_height, block_depth): temperature[IDX(1+i,1+j,block_depth+1, block_width, block_height)] = 1 @cuda.jit -def jacobiKernel(temp, new_temp, block_width, block_height, block_depth): +def jacobiKernel(temperature, new_temperature, block_width, block_height, block_depth): i = (cuda.blockDim.x*cuda.blockIdx.x+cuda.threadIdx.x)+1 j = (cuda.blockDim.y*cuda.blockIdx.y+cuda.threadIdx.y)+1 k = (cuda.blockDim.z*cuda.blockIdx.z+cuda.threadIdx.z)+1 From a4dbe3c2f556b55da91d6367005e97621be84b8d Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 27 Jan 2021 20:02:14 -0500 Subject: [PATCH 071/107] correct ghost info now received, output runtime info --- examples/cuda/gpudirect/jacobi3d/block.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/examples/cuda/gpudirect/jacobi3d/block.py b/examples/cuda/gpudirect/jacobi3d/block.py index 9a87e350..0c032a8c 100644 --- a/examples/cuda/gpudirect/jacobi3d/block.py +++ b/examples/cuda/gpudirect/jacobi3d/block.py @@ -248,7 +248,6 @@ def updateAndPack(self): self.d_ghosts[i].copy_to_host(self.h_ghosts[i]) self.stream.synchronize() - @coro def sendGhost(self, direction): send_ch = self.neighbor_channels[direction] @@ -271,14 +270,16 @@ def recvGhosts(self): ch.recv(post_buf_addresses = self.d_recv_ghosts_addr[neighbor_idx], post_buf_sizes = self.d_recv_ghosts_size[neighbor_idx] ) + recv_ghost = self.d_recv_ghosts[neighbor_idx] else: self.h_ghosts[neighbor_idx] = ch.recv() self.d_ghosts[neighbor_idx].copy_to_device(self.h_ghosts[neighbor_idx], stream=self.stream ) + recv_ghost = 
self.d_ghosts[neighbor_idx] kernels.invokeUnpackingKernel(self.d_temperature, - self.d_ghosts[neighbor_idx], + recv_ghost, ch.recv_direction, block_width, block_height, @@ -317,5 +318,7 @@ def run(self, done_future): tend = time.time() if self.thisIndex == (0, 0, 0): - print(f'Elapsed time: {tend-tstart}') + elapsed_time = tend-tstart + print(f'Elapsed time: {round(elapsed_time,5)}') + print(f'Approximate time per iteration: {round(((elapsed_time/n_iters)*1e6),5)}') self.reduce(done_future) From b8eca90ed753fede2b43977db36155801d467732 Mon Sep 17 00:00:00 2001 From: Jaemin Choi Date: Wed, 27 Jan 2021 21:01:22 -0500 Subject: [PATCH 072/107] Fix wrong assertion --- charm4py/channel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charm4py/channel.py b/charm4py/channel.py index e187f19f..4584c618 100644 --- a/charm4py/channel.py +++ b/charm4py/channel.py @@ -80,7 +80,7 @@ def recv(self, *post_buffers, post_buf_addresses = None, post_buf_sizes = None, elif post_buf_addresses != None: gpu_recv_bufs = ret # ret = ret[:-1:1] - assert len(post_buffers) == len(gpu_recv_bufs) + assert len(post_buf_addresses) == len(gpu_recv_bufs) assert post_buf_sizes recv_future = charm.getGPUDirectDataFromAddresses(post_buf_addresses, post_buf_sizes, gpu_recv_bufs, stream_ptrs) recv_future.get() From 60a250cda558413baf65ae39aade5bdbf3492ae7 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 27 Jan 2021 21:36:13 -0500 Subject: [PATCH 073/107] use charmlib hooks for copying --- examples/cuda/gpudirect/jacobi3d/block.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/examples/cuda/gpudirect/jacobi3d/block.py b/examples/cuda/gpudirect/jacobi3d/block.py index 0c032a8c..c2af0b5d 100644 --- a/examples/cuda/gpudirect/jacobi3d/block.py +++ b/examples/cuda/gpudirect/jacobi3d/block.py @@ -244,8 +244,12 @@ def updateAndPack(self): self.stream ) if not use_zerocopy: - # TODO: change this to the CUDA hooks in charmlib - self.d_ghosts[i].copy_to_host(self.h_ghosts[i]) + # self.d_ghosts[i].copy_to_host(self.h_ghosts[i]) + charm.lib.CudaDtoH(self.h_ghosts[i].__array_interface__['data'][0], + self.d_ghosts[i].__cuda_array_interface__['data'][0], + self.d_ghosts[i].nbytes, 0 + ) + self.stream.synchronize() @coro @@ -273,9 +277,14 @@ def recvGhosts(self): recv_ghost = self.d_recv_ghosts[neighbor_idx] else: self.h_ghosts[neighbor_idx] = ch.recv() - self.d_ghosts[neighbor_idx].copy_to_device(self.h_ghosts[neighbor_idx], - stream=self.stream - ) + # self.d_ghosts[neighbor_idx].copy_to_device(self.h_ghosts[neighbor_idx], + # stream=self.stream + # ) + charm.lib.CudaHtoD(self.d_ghosts[neighbor_idx].__cuda_array_interface__['data'][0], + self.h_ghosts[neighbor_idx].__array_interface__['data'][0], + self.d_ghosts[neighbor_idx].nbytes, 0 + ) + recv_ghost = self.d_ghosts[neighbor_idx] kernels.invokeUnpackingKernel(self.d_temperature, From b700cba0671b31f96990d9fb0f8e20192fd22e03 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 27 Jan 2021 21:36:54 -0500 Subject: [PATCH 074/107] temporary hard-code stream 0 --- charm4py/charmlib/charmlib_cython.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/charm4py/charmlib/charmlib_cython.pyx b/charm4py/charmlib/charmlib_cython.pyx index c360c264..271af7c9 100644 --- a/charm4py/charmlib/charmlib_cython.pyx +++ b/charm4py/charmlib/charmlib_cython.pyx @@ -958,13 +958,13 @@ class CharmLib(object): future_id ) def CudaHtoD(self, long destAddr, long srcAddr, int nbytes, long streamAddr): - CkCUDAHtoD(destAddr, 
srcAddr,nbytes, ( streamAddr)[0]); + CkCUDAHtoD(destAddr, srcAddr,nbytes, 0); def CudaDtoH(self, long destAddr, long srcAddr, int nbytes, long streamAddr): - CkCUDADtoH(destAddr, srcAddr,int(nbytes), ( streamAddr)[0]); + CkCUDADtoH(destAddr, srcAddr,int(nbytes), 0); def CudaStreamSynchronize(self, long streamAddr): - CkCUDAStreamSynchronize((streamAddr)[0]) + CkCUDAStreamSynchronize(0) From 475540a84df998c0c26c6fa0997f11b2a0be2a8b Mon Sep 17 00:00:00 2001 From: Jaemin Choi Date: Wed, 27 Jan 2021 22:23:07 -0500 Subject: [PATCH 075/107] Charm4py Jacobi3D: Update timer outputs, make default iter 100 --- examples/cuda/gpudirect/jacobi3d/block.py | 5 +++-- examples/cuda/gpudirect/jacobi3d/jacobi3d.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/cuda/gpudirect/jacobi3d/block.py b/examples/cuda/gpudirect/jacobi3d/block.py index 0c032a8c..636f38ca 100644 --- a/examples/cuda/gpudirect/jacobi3d/block.py +++ b/examples/cuda/gpudirect/jacobi3d/block.py @@ -319,6 +319,7 @@ def run(self, done_future): if self.thisIndex == (0, 0, 0): elapsed_time = tend-tstart - print(f'Elapsed time: {round(elapsed_time,5)}') - print(f'Approximate time per iteration: {round(((elapsed_time/n_iters)*1e6),5)}') + print(f'Elapsed time: {round(elapsed_time,3)} s') + print(f'Average time per iteration: {round(((elapsed_time/n_iters)*1e3),3)} ms') + print(f'Communication time per iteration: {round(((comm_time/n_iters)*1e3),3)} ms') self.reduce(done_future) diff --git a/examples/cuda/gpudirect/jacobi3d/jacobi3d.py b/examples/cuda/gpudirect/jacobi3d/jacobi3d.py index 2160ed7e..c759807c 100644 --- a/examples/cuda/gpudirect/jacobi3d/jacobi3d.py +++ b/examples/cuda/gpudirect/jacobi3d/jacobi3d.py @@ -11,7 +11,7 @@ class Defaults(Enum): GRID_WIDTH = 512 GRID_HEIGHT = 512 GRID_DEPTH = 512 - NUM_ITERS = 512 + NUM_ITERS = 100 WARMUP_ITERS = 10 USE_ZEROCOPY = False PRINT_ELEMENTS = False From c0e5398cdd63136db8b88baa959f7848a8cbb283 Mon Sep 17 00:00:00 2001 From: Jaemin Choi Date: Wed, 27 Jan 2021 23:30:32 -0500 Subject: [PATCH 076/107] Charm4py Jacobi3D: Revert to old host-staging method to avoid errors --- examples/cuda/gpudirect/jacobi3d/block.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/examples/cuda/gpudirect/jacobi3d/block.py b/examples/cuda/gpudirect/jacobi3d/block.py index f16aa11d..98665981 100644 --- a/examples/cuda/gpudirect/jacobi3d/block.py +++ b/examples/cuda/gpudirect/jacobi3d/block.py @@ -244,11 +244,13 @@ def updateAndPack(self): self.stream ) if not use_zerocopy: - # self.d_ghosts[i].copy_to_host(self.h_ghosts[i]) + self.d_ghosts[i].copy_to_host(self.h_ghosts[i], self.stream) + ''' charm.lib.CudaDtoH(self.h_ghosts[i].__array_interface__['data'][0], self.d_ghosts[i].__cuda_array_interface__['data'][0], self.d_ghosts[i].nbytes, 0 ) + ''' self.stream.synchronize() @@ -277,14 +279,15 @@ def recvGhosts(self): recv_ghost = self.d_recv_ghosts[neighbor_idx] else: self.h_ghosts[neighbor_idx] = ch.recv() - # self.d_ghosts[neighbor_idx].copy_to_device(self.h_ghosts[neighbor_idx], - # stream=self.stream - # ) + self.d_ghosts[neighbor_idx].copy_to_device(self.h_ghosts[neighbor_idx], + stream=self.stream + ) + ''' charm.lib.CudaHtoD(self.d_ghosts[neighbor_idx].__cuda_array_interface__['data'][0], self.h_ghosts[neighbor_idx].__array_interface__['data'][0], self.d_ghosts[neighbor_idx].nbytes, 0 ) - + ''' recv_ghost = self.d_ghosts[neighbor_idx] kernels.invokeUnpackingKernel(self.d_temperature, From 91be5a60f3fc3e1cf98a42fdb698e092602f3ca7 Mon Sep 17 00:00:00 2001 
From: Jaemin Choi Date: Wed, 27 Jan 2021 23:48:44 -0500 Subject: [PATCH 077/107] Revert back host-staging mechanism --- examples/cuda/gpudirect/jacobi3d/block.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/examples/cuda/gpudirect/jacobi3d/block.py b/examples/cuda/gpudirect/jacobi3d/block.py index 98665981..f96275a6 100644 --- a/examples/cuda/gpudirect/jacobi3d/block.py +++ b/examples/cuda/gpudirect/jacobi3d/block.py @@ -216,12 +216,6 @@ def init_bounds(self, x, y, z): self.neighbors = neighbors - @coro - def sendGhosts(self): - for dir in range(kernels.DIR_COUNT): - if not self.bounds[dir]: - self.sendGhost(dir) - def updateAndPack(self): kernels.invokeJacobiKernel(self.d_temperature, self.d_new_temperature, @@ -244,16 +238,20 @@ def updateAndPack(self): self.stream ) if not use_zerocopy: - self.d_ghosts[i].copy_to_host(self.h_ghosts[i], self.stream) - ''' + #self.d_ghosts[i].copy_to_host(self.h_ghosts[i], self.stream) charm.lib.CudaDtoH(self.h_ghosts[i].__array_interface__['data'][0], self.d_ghosts[i].__cuda_array_interface__['data'][0], self.d_ghosts[i].nbytes, 0 ) - ''' self.stream.synchronize() + @coro + def sendGhosts(self): + for dir in range(kernels.DIR_COUNT): + if not self.bounds[dir]: + self.sendGhost(dir) + @coro def sendGhost(self, direction): send_ch = self.neighbor_channels[direction] @@ -279,6 +277,7 @@ def recvGhosts(self): recv_ghost = self.d_recv_ghosts[neighbor_idx] else: self.h_ghosts[neighbor_idx] = ch.recv() + ''' self.d_ghosts[neighbor_idx].copy_to_device(self.h_ghosts[neighbor_idx], stream=self.stream ) @@ -287,7 +286,6 @@ def recvGhosts(self): self.h_ghosts[neighbor_idx].__array_interface__['data'][0], self.d_ghosts[neighbor_idx].nbytes, 0 ) - ''' recv_ghost = self.d_ghosts[neighbor_idx] kernels.invokeUnpackingKernel(self.d_temperature, From 3069ce263356a07f25258b34b4059014c0017a83 Mon Sep 17 00:00:00 2001 From: Jaemin Choi Date: Thu, 28 Jan 2021 00:14:40 -0500 Subject: [PATCH 078/107] Fix recvGhost to work with more than 2 processes --- examples/cuda/gpudirect/jacobi3d/block.py | 66 ++++++++++++----------- 1 file changed, 35 insertions(+), 31 deletions(-) diff --git a/examples/cuda/gpudirect/jacobi3d/block.py b/examples/cuda/gpudirect/jacobi3d/block.py index f96275a6..b99b534c 100644 --- a/examples/cuda/gpudirect/jacobi3d/block.py +++ b/examples/cuda/gpudirect/jacobi3d/block.py @@ -265,37 +265,41 @@ def sendGhost(self, direction): @coro def recvGhosts(self): - for ch in charm.iwait(self.active_neighbor_channels): - # remember: we set 'recv_direction' member - # directly in the initialization phase - neighbor_idx = ch.recv_direction - - if use_zerocopy: - ch.recv(post_buf_addresses = self.d_recv_ghosts_addr[neighbor_idx], - post_buf_sizes = self.d_recv_ghosts_size[neighbor_idx] - ) - recv_ghost = self.d_recv_ghosts[neighbor_idx] - else: - self.h_ghosts[neighbor_idx] = ch.recv() - ''' - self.d_ghosts[neighbor_idx].copy_to_device(self.h_ghosts[neighbor_idx], - stream=self.stream - ) - ''' - charm.lib.CudaHtoD(self.d_ghosts[neighbor_idx].__cuda_array_interface__['data'][0], - self.h_ghosts[neighbor_idx].__array_interface__['data'][0], - self.d_ghosts[neighbor_idx].nbytes, 0 - ) - recv_ghost = self.d_ghosts[neighbor_idx] - - kernels.invokeUnpackingKernel(self.d_temperature, - recv_ghost, - ch.recv_direction, - block_width, - block_height, - block_depth, - self.stream - ) + # Not using charm.iwait as it errors out with more than 2 processes + for dir in range(kernels.DIR_COUNT): + if not self.bounds[dir]: + self.recvGhost(dir) + 
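
[Editor's note, not part of the patch above] This hunk drops completion-order receives (charm.iwait) in favor of a fixed-order loop because iwait errored out with more than two processes at this point in the series; patches 082-084 later restore completion-order delivery through charm.iwait_map. A minimal illustrative sketch of the two receive patterns, using only names defined in block.py (not applied code):

    # fixed order, as in this patch: block on each open face in turn
    for direction in range(kernels.DIR_COUNT):
        if not self.bounds[direction]:
            self.recvGhost(direction)

    # completion order, as restored by patches 082-084: recvGhost is reworked
    # to take the channel itself, and each ghost is unpacked as soon as its
    # channel has data available
    charm.iwait_map(self.recvGhost, self.active_neighbor_channels)
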
+ @coro + def recvGhost(self, direction): + recv_ch = self.neighbor_channels[direction] + + if use_zerocopy: + recv_ch.recv(post_buf_addresses = self.d_recv_ghosts_addr[direction], + post_buf_sizes = self.d_recv_ghosts_size[direction] + ) + recv_ghost = self.d_recv_ghosts[direction] + else: + self.h_ghosts[direction] = recv_ch.recv() + ''' + self.d_ghosts[direction].copy_to_device(self.h_ghosts[direction], + stream=self.stream + ) + ''' + charm.lib.CudaHtoD(self.d_ghosts[direction].__cuda_array_interface__['data'][0], + self.h_ghosts[direction].__array_interface__['data'][0], + self.d_ghosts[direction].nbytes, 0 + ) + recv_ghost = self.d_ghosts[direction] + + kernels.invokeUnpackingKernel(self.d_temperature, + recv_ghost, + direction, + block_width, + block_height, + block_depth, + self.stream + ) self.stream.synchronize() @coro From 12483a018345688a5ef0e20930f4256564b9ad9e Mon Sep 17 00:00:00 2001 From: Jaemin Choi Date: Thu, 28 Jan 2021 00:56:52 -0500 Subject: [PATCH 079/107] Fix chare dimension calculation error --- examples/cuda/gpudirect/jacobi3d/jacobi3d.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cuda/gpudirect/jacobi3d/jacobi3d.py b/examples/cuda/gpudirect/jacobi3d/jacobi3d.py index c759807c..2a2cacbe 100644 --- a/examples/cuda/gpudirect/jacobi3d/jacobi3d.py +++ b/examples/cuda/gpudirect/jacobi3d/jacobi3d.py @@ -159,7 +159,7 @@ def calc_num_chares_per_dim(num_chares, grid_w, grid_h, grid_d): while ipy <= nremain: if not nremain % ipy: ipz = nremain // ipy - surf = area[0] / ipx / ipy + area[1] / ipz + area[2] / ipy / ipz + surf = area[0] / ipx / ipy + area[1] / ipx / ipz + area[2] / ipy / ipz if surf < bestsurf: bestsurf = surf From b02d6d79b9e0e47f9bb7a0f1169c0d9c4d8e6a1e Mon Sep 17 00:00:00 2001 From: Jaemin Choi Date: Thu, 28 Jan 2021 00:57:04 -0500 Subject: [PATCH 080/107] Add scripts --- .../gpudirect/jacobi3d/scripts/charm4py.sh | 46 ++++++++++++++ .../jacobi3d/scripts/process_times.py | 63 +++++++++++++++++++ 2 files changed, 109 insertions(+) create mode 100755 examples/cuda/gpudirect/jacobi3d/scripts/charm4py.sh create mode 100755 examples/cuda/gpudirect/jacobi3d/scripts/process_times.py diff --git a/examples/cuda/gpudirect/jacobi3d/scripts/charm4py.sh b/examples/cuda/gpudirect/jacobi3d/scripts/charm4py.sh new file mode 100755 index 00000000..2d07c54c --- /dev/null +++ b/examples/cuda/gpudirect/jacobi3d/scripts/charm4py.sh @@ -0,0 +1,46 @@ +#!/bin/bash +#BSUB -W 30 +#BSUB -P csc357 +#BSUB -nnodes 2 +#BSUB -J jacobi3d-charm4py-weak-n2 + +# These need to be changed between submissions +file=jacobi3d.py +n_nodes=2 +n_procs=$((n_nodes * 6)) +grid_width=3072 +grid_height=1536 +grid_depth=1536 + +# Function to display commands +exe() { echo "\$ $@" ; "$@" ; } + +cd $HOME/work/charm4py/examples/cuda/gpudirect/jacobi3d + +conda activate charm4py + +export LD_LIBRARY_PATH=$HOME/work/ucx/install/lib:$HOME/work/pmix-3.1.5/install/lib:/sw/summit/gdrcopy/2.0/lib64:$LD_LIBRARY_PATH +export UCX_MEMTYPE_CACHE=n + +ppn=1 +pemap="L0,4,8,84,88,92" +n_iters=100 +warmup_iters=10 + +echo "# Charm4py Jacobi3D Performance Benchmarking (GPUDirect off)" + +for iter in 1 2 3 +do + date + echo "# Run $iter" + exe jsrun -n$n_procs -a1 -c$ppn -g1 -K3 -r6 --smpiargs="-disable_gpu_hooks" python3 ./$file -x $grid_width -y $grid_height -z $grid_depth -w $warmup_iters -i $n_iters +ppn $ppn +pemap $pemap +done + +echo "# Charm4py Jacobi3D Performance Benchmarking (GPUDirect on)" + +for iter in 1 2 3 +do + date + echo "# Run $iter" + exe jsrun -n$n_procs -a1 -c$ppn -g1 -K3 
-r6 --smpiargs="-disable_gpu_hooks" python3 ./$file -x $grid_width -y $grid_height -z $grid_depth -w $warmup_iters -i $n_iters +ppn $ppn +pemap $pemap -d +done diff --git a/examples/cuda/gpudirect/jacobi3d/scripts/process_times.py b/examples/cuda/gpudirect/jacobi3d/scripts/process_times.py new file mode 100755 index 00000000..0063b79d --- /dev/null +++ b/examples/cuda/gpudirect/jacobi3d/scripts/process_times.py @@ -0,0 +1,63 @@ +#!/usr/bin/env python3 +import os +import sys +import csv +import statistics + +if len(sys.argv) != 4: + print('Please use', sys.argv[0], '[job name] [start node count] [end node count]') + exit() + +job_name = sys.argv[1] +start_node_count = int(sys.argv[2]) +end_node_count = int(sys.argv[3]) + +csv_filename = job_name + '.csv' +csv_file = open(csv_filename, 'w', newline='') +writer = csv.writer(csv_file) +writer.writerow(['Number of GPUs', 'Charm4py-H-Total', 'error', 'Charm4py-H-Comm', 'error', 'Charm4py-D-Total', 'error', 'Charm4py-D-Comm', 'error']) + +def is_host(index): + return index % 6 == 0 or index % 6 == 1 or index % 6 == 2 + +node_count_list = [] +cur_node_count = start_node_count +while cur_node_count <= end_node_count: + node_count_list.append(cur_node_count) + cur_node_count *= 2 + +for node_count in node_count_list: + print('Node count:', str(node_count)) + total_str = 'grep -ir "Average time per" ' + job_name + '-n' + str(node_count) + '.* | cut -d " " -f5' + comm_str = 'grep -ir "Communication time" ' + job_name + '-n' + str(node_count) + '.* | cut -d " " -f5' + + total_stream = os.popen(total_str) + total_lines = total_stream.readlines() + total_times = list(map(lambda x: x, list(map(float, list(map(str.rstrip, total_lines)))))) + comm_stream = os.popen(comm_str) + comm_lines = comm_stream.readlines() + comm_times = list(map(lambda x: x, list(map(float, list(map(str.rstrip, comm_lines)))))) + + h_total_times = [total_times[i] for i in range(len(total_times)) if is_host(i)] + h_comm_times = [comm_times[i] for i in range(len(comm_times)) if is_host(i)] + d_total_times = [total_times[i] for i in range(len(total_times)) if not is_host(i)] + d_comm_times = [comm_times[i] for i in range(len(comm_times)) if not is_host(i)] + print('H total:', h_total_times) + print('H comm:', h_comm_times) + print('D total:', d_total_times) + print('D comm:', d_comm_times) + + h_total_avg = round(statistics.mean(h_total_times), 2) + h_total_stdev = round(statistics.stdev(h_total_times), 2) + h_comm_avg = round(statistics.mean(h_comm_times), 2) + h_comm_stdev = round(statistics.stdev(h_comm_times), 2) + d_total_avg = round(statistics.mean(d_total_times), 2) + d_total_stdev = round(statistics.stdev(d_total_times), 2) + d_comm_avg = round(statistics.mean(d_comm_times), 2) + d_comm_stdev = round(statistics.stdev(d_comm_times), 2) + print('H total avg:', h_total_avg, 'stdev:', h_total_stdev) + print('H comm avg:', h_comm_avg, 'stdev:', h_comm_stdev) + print('D total avg:', d_total_avg, 'stdev:', d_total_stdev) + print('D comm avg:', d_comm_avg, 'stdev:', d_comm_stdev) + + writer.writerow([str(node_count), str(h_total_avg), str(h_total_stdev), str(h_comm_avg), str(h_comm_stdev), str(d_total_avg), str(d_total_stdev), str(d_comm_avg), str(d_comm_stdev)]) From 20913a94e8be5256625f1c6a37d59b44427f9ece Mon Sep 17 00:00:00 2001 From: Jaemin Choi Date: Thu, 28 Jan 2021 15:57:42 -0500 Subject: [PATCH 081/107] Update Charm4py script, need to use source activate instead of conda activate --- .../gpudirect/jacobi3d/scripts/charm4py.sh | 18 ++++++++++-------- 1 file changed, 10 
insertions(+), 8 deletions(-) diff --git a/examples/cuda/gpudirect/jacobi3d/scripts/charm4py.sh b/examples/cuda/gpudirect/jacobi3d/scripts/charm4py.sh index 2d07c54c..54fcaa78 100755 --- a/examples/cuda/gpudirect/jacobi3d/scripts/charm4py.sh +++ b/examples/cuda/gpudirect/jacobi3d/scripts/charm4py.sh @@ -1,26 +1,28 @@ #!/bin/bash #BSUB -W 30 #BSUB -P csc357 -#BSUB -nnodes 2 -#BSUB -J jacobi3d-charm4py-weak-n2 +#BSUB -nnodes 256 +#BSUB -J jacobi3d-charm4py-strong-n256 # These need to be changed between submissions file=jacobi3d.py -n_nodes=2 +n_nodes=256 n_procs=$((n_nodes * 6)) grid_width=3072 -grid_height=1536 -grid_depth=1536 +grid_height=3072 +grid_depth=3072 # Function to display commands exe() { echo "\$ $@" ; "$@" ; } cd $HOME/work/charm4py/examples/cuda/gpudirect/jacobi3d -conda activate charm4py +#exe conda init bash +#exe conda activate charm4py +exe source activate charm4py -export LD_LIBRARY_PATH=$HOME/work/ucx/install/lib:$HOME/work/pmix-3.1.5/install/lib:/sw/summit/gdrcopy/2.0/lib64:$LD_LIBRARY_PATH -export UCX_MEMTYPE_CACHE=n +exe export LD_LIBRARY_PATH=$HOME/work/ucx/install/lib:$HOME/work/pmix-3.1.5/install/lib:/sw/summit/gdrcopy/2.0/lib64:$LD_LIBRARY_PATH +exe export UCX_MEMTYPE_CACHE=n ppn=1 pemap="L0,4,8,84,88,92" From 581dae957171fef7668e46804ccf4c695aa84f14 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 17 Mar 2021 10:21:06 -0500 Subject: [PATCH 082/107] Add iwait_map function to fix the issue --- charm4py/charm.py | 50 +++++++++++++++++++++++++++++++++++++++++++++ charm4py/threads.py | 6 +++++- 2 files changed, 55 insertions(+), 1 deletion(-) diff --git a/charm4py/charm.py b/charm4py/charm.py index a13935fe..b34f287f 100644 --- a/charm4py/charm.py +++ b/charm4py/charm.py @@ -28,6 +28,7 @@ from . import reduction from . import wait import array +import greenlet try: import numpy except ImportError: @@ -820,12 +821,61 @@ def iwait(self, objs): n -= 1 yield obj else: + print('waitready this') obj.waitReady(f) while n > 0: + print('thread paused') obj = self.threadMgr.pauseThread() + print('thread resumed', obj) n -= 1 + print('n', n) yield obj + def iwait_map(self, func, objs): + n = len(objs) + f = LocalFuture() + done_fut = LocalFuture() + remaining_grs = [n] + + def map_func(remaining, obj): + gr = greenlet.getcurrent() + gr.notify = gr.parent.notify + gr.obj = gr.parent.obj + gr.fu = 1 + func(obj) + remaining[0] -= 1 + + def gr_func(): + return map_func(remaining_grs, obj) + + for obj in objs: + if obj.ready(): + new_gr = greenlet.greenlet(gr_func) + n -= 1 + new_gr.switch() + else: + obj.waitReady(f) + while n > 0: + obj = self.threadMgr.pauseThread() + # if obj is None, then we are being resumed by a finishing greenlet + # and should pause + if obj is None: + continue + + new_gr = greenlet.greenlet(gr_func) + n -= 1 + obj = new_gr.switch() + + # if ret is not None, then we are being resumed + #by the thread manager because a new object is ready to receive + if obj: + new_gr = greenlet.greenlet(gr_func) + n -= 1 + new_gr.switch() + + while remaining_grs[0]: + self.threadMgr.pauseThread() + def wait(self, objs): for o in self.iwait(objs): pass diff --git a/charm4py/threads.py b/charm4py/threads.py index e05678cf..38343a2b 100644 --- a/charm4py/threads.py +++ b/charm4py/threads.py @@ -166,11 +166,15 @@ def pauseThread(self): if gr.notify: obj = gr.obj obj._thread_notify_target.threadPaused(obj._thread_notify_data) - if gr.parent != main_gr: + if False and gr.parent != main_gr: # this can happen with threaded chare constructors that are called # "inline" by Charm++ 
on the PE where the collection is created. # Initially it will switch back to the parent thread, but after that # we make the parent to be the main thread + # try: + # if gr.fu: + # return main_gr.switch() + # except: parent = gr.parent gr.parent = main_gr return parent.switch() From 65ad6116aeebbb68d8f9c2701b84d528e2809460 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 31 Mar 2021 16:52:47 -0400 Subject: [PATCH 083/107] update to use charm.iwait_map --- examples/cuda/gpudirect/jacobi3d/block.py | 58 +++++++++++++++----- examples/cuda/gpudirect/jacobi3d/jacobi3d.py | 2 +- 2 files changed, 44 insertions(+), 16 deletions(-) diff --git a/examples/cuda/gpudirect/jacobi3d/block.py b/examples/cuda/gpudirect/jacobi3d/block.py index b99b534c..4732ecb7 100644 --- a/examples/cuda/gpudirect/jacobi3d/block.py +++ b/examples/cuda/gpudirect/jacobi3d/block.py @@ -246,13 +246,13 @@ def updateAndPack(self): self.stream.synchronize() - @coro def sendGhosts(self): + count = 0 for dir in range(kernels.DIR_COUNT): if not self.bounds[dir]: self.sendGhost(dir) + count += 1 - @coro def sendGhost(self, direction): send_ch = self.neighbor_channels[direction] @@ -263,19 +263,48 @@ def sendGhost(self, direction): else: send_ch.send(self.h_ghosts[direction]) - @coro def recvGhosts(self): - # Not using charm.iwait as it errors out with more than 2 processes - for dir in range(kernels.DIR_COUNT): - if not self.bounds[dir]: - self.recvGhost(dir) - - @coro - def recvGhost(self, direction): - recv_ch = self.neighbor_channels[direction] - + charm.iwait_map(self.recvGhost, self.active_neighbor_channels) + # for ch in charm.iwait(self.active_neighbor_channels): + # # remember: we set 'recv_direction' member + # # directly in the initialization phase + # neighbor_idx = ch.recv_direction + + # if use_zerocopy: + # ch.recv(post_buf_addresses = self.d_recv_ghosts_addr[neighbor_idx], + # post_buf_sizes = self.d_recv_ghosts_size[neighbor_idx] + # ) + # recv_ghost = self.d_recv_ghosts[neighbor_idx] + # else: + # self.h_ghosts[neighbor_idx] = ch.recv() + # ''' + # self.d_ghosts[neighbor_idx].copy_to_device(self.h_ghosts[neighbor_idx], + # stream=self.stream + # ) + # ''' + # charm.lib.CudaHtoD(self.d_ghosts[neighbor_idx].__cuda_array_interface__['data'][0], + # self.h_ghosts[neighbor_idx].__array_interface__['data'][0], + # self.d_ghosts[neighbor_idx].nbytes, 0 + # ) + # recv_ghost = self.d_ghosts[neighbor_idx] + + # kernels.invokeUnpackingKernel(self.d_temperature, + # recv_ghost, + # ch.recv_direction, + # block_width, + # block_height, + # block_depth, + # self.stream + # ) + # # Not using charm.iwait as it errors out with more than 2 processes + # for dir in range(kernels.DIR_COUNT): + # if not self.bounds[dir]: + # self.recvGhost(dir) + + def recvGhost(self, recv_ch): + direction = recv_ch.recv_direction if use_zerocopy: - recv_ch.recv(post_buf_addresses = self.d_recv_ghosts_addr[direction], + f = recv_ch.recv(post_buf_addresses = self.d_recv_ghosts_addr[direction], post_buf_sizes = self.d_recv_ghosts_size[direction] ) recv_ghost = self.d_recv_ghosts[direction] @@ -302,7 +331,7 @@ def recvGhost(self, direction): ) self.stream.synchronize() - @coro + # @coro def exchangeGhosts(self): self.d_temperature, self.d_new_temperature = \ self.d_new_temperature, self.d_temperature @@ -328,7 +357,6 @@ def run(self, done_future): if current_iter >= warmup_iters: comm_time += time.time() - comm_start_time - tend = time.time() if self.thisIndex == (0, 0, 0): diff --git a/examples/cuda/gpudirect/jacobi3d/jacobi3d.py 
b/examples/cuda/gpudirect/jacobi3d/jacobi3d.py index 2a2cacbe..b3f9bcd7 100644 --- a/examples/cuda/gpudirect/jacobi3d/jacobi3d.py +++ b/examples/cuda/gpudirect/jacobi3d/jacobi3d.py @@ -128,13 +128,13 @@ def main(args): ).get() init_done_future = Future() + run_future = Future() block_proxy = Array(Block, dims=[n_chares_x, n_chares_y, n_chares_z], args = [init_done_future] ) init_done_future.get() - run_future = Future() block_proxy.run(run_future) run_future.get() charm.exit() From 69dd6f51d633159b26cd76e89989cc82ba02ff1d Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 31 Mar 2021 16:55:17 -0400 Subject: [PATCH 084/107] Fix charm.iwait_map bug --- charm4py/channel.py | 1 + charm4py/charm.py | 31 +++++++++++-------------------- charm4py/threads.py | 3 ++- 3 files changed, 14 insertions(+), 21 deletions(-) diff --git a/charm4py/channel.py b/charm4py/channel.py index aaf058ec..74726b95 100644 --- a/charm4py/channel.py +++ b/charm4py/channel.py @@ -68,4 +68,5 @@ def recv(self): ret = self.recv_fut.get() self.recv_fut = None self.recv_seqno = (self.recv_seqno + 1) % CHAN_BUF_SIZE + return ret diff --git a/charm4py/charm.py b/charm4py/charm.py index b34f287f..8f847c78 100644 --- a/charm4py/charm.py +++ b/charm4py/charm.py @@ -816,25 +816,21 @@ def triggerCallable(self, tag): def iwait(self, objs): n = len(objs) f = LocalFuture() + for obj in objs: if obj.ready(): n -= 1 yield obj else: - print('waitready this') obj.waitReady(f) while n > 0: - print('thread paused') obj = self.threadMgr.pauseThread() - print('thread resumed', obj) n -= 1 - print('n', n) yield obj def iwait_map(self, func, objs): n = len(objs) f = LocalFuture() - done_fut = LocalFuture() remaining_grs = [n] def map_func(remaining, obj): @@ -852,27 +848,22 @@ def gr_func(): if obj.ready(): new_gr = greenlet.greenlet(gr_func) n -= 1 - new_gr.switch() + obj = new_gr.switch() + while obj: + assert isinstance(obj, Channel) + new_gr = greenlet.greenlet(gr_func) + n -= 1 + obj = new_gr.switch() else: obj.waitReady(f) while n > 0: obj = self.threadMgr.pauseThread() - # if obj is None, then we are being resumed by a finishing greenlet - # and should pause - if obj is None: - continue - - new_gr = greenlet.greenlet(gr_func) - n -= 1 - obj = new_gr.switch() - - # if ret is not None, then we are being resumed - #by the thread manager because a new object is ready to receive - if obj: + while obj: + assert isinstance(obj, Channel) new_gr = greenlet.greenlet(gr_func) n -= 1 - new_gr.switch() - + obj = new_gr.switch() + while remaining_grs[0]: self.threadMgr.pauseThread() diff --git a/charm4py/threads.py b/charm4py/threads.py index 38343a2b..fd7cec3e 100644 --- a/charm4py/threads.py +++ b/charm4py/threads.py @@ -179,7 +179,8 @@ def pauseThread(self): gr.parent = main_gr return parent.switch() else: - return main_gr.switch() + ret_val = main_gr.switch() + return ret_val def _resumeThread(self, gr, arg): """ Deposit a result or signal that a local entry method thread is waiting on, From afeaa6b3329aabfcb4e13dfda0ed1b2788ec1487 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 31 Mar 2021 16:00:51 -0500 Subject: [PATCH 085/107] no need for channel assertion --- charm4py/charm.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/charm4py/charm.py b/charm4py/charm.py index 8f847c78..f78f5188 100644 --- a/charm4py/charm.py +++ b/charm4py/charm.py @@ -850,7 +850,6 @@ def gr_func(): n -= 1 obj = new_gr.switch() while obj: - assert isinstance(obj, Channel) new_gr = greenlet.greenlet(gr_func) n -= 1 obj = new_gr.switch() @@ -859,7 +858,6 @@ def 
gr_func(): while n > 0: obj = self.threadMgr.pauseThread() while obj: - assert isinstance(obj, Channel) new_gr = greenlet.greenlet(gr_func) n -= 1 obj = new_gr.switch() From 0f34c1a0a4ef7cd0f8c13e87d07009e3eed18fef Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 21 Apr 2021 11:55:42 -0400 Subject: [PATCH 086/107] correctly differentiate between instances when GPU data sent with other types --- charm4py/channel.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/charm4py/channel.py b/charm4py/channel.py index 4584c618..2e84d481 100644 --- a/charm4py/channel.py +++ b/charm4py/channel.py @@ -72,14 +72,25 @@ def recv(self, *post_buffers, post_buf_addresses = None, post_buf_sizes = None, self.recv_seqno = (self.recv_seqno + 1) % CHAN_BUF_SIZE if post_buffers: - gpu_recv_bufs = ret[-1] - # ret = ret[:-1:1] + if isinstance(ret, tuple): + print(ret) + gpu_recv_bufs = ret[-1] + ret = ret[0:-1] + if len(ret) == 1: + ret = ret[0] + else: + gpu_recv_bufs = ret assert len(post_buffers) == len(gpu_recv_bufs) recv_future = charm.getGPUDirectData(post_buffers, gpu_recv_bufs, stream_ptrs) recv_future.get() elif post_buf_addresses != None: - gpu_recv_bufs = ret - # ret = ret[:-1:1] + if isinstance(ret, tuple): + gpu_recv_bufs = ret[-1] + ret = ret[0:-1] + if len(ret) == 1: + ret = ret[0] + else: + gpu_recv_bufs = ret assert len(post_buf_addresses) == len(gpu_recv_bufs) assert post_buf_sizes recv_future = charm.getGPUDirectDataFromAddresses(post_buf_addresses, post_buf_sizes, gpu_recv_bufs, stream_ptrs) From 92f2cba4743a1baab59de72a13feef91f1661699 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 21 Apr 2021 11:56:49 -0400 Subject: [PATCH 087/107] numpy arrays can now be sent with GPU-direct data --- charm4py/charm.py | 1 + charm4py/charmlib/ccharm.pxd | 8 +++ charm4py/charmlib/charmlib_cython.pyx | 73 +++++++++++++++++++-------- 3 files changed, 60 insertions(+), 22 deletions(-) diff --git a/charm4py/charm.py b/charm4py/charm.py index 4b2235d8..71df707f 100644 --- a/charm4py/charm.py +++ b/charm4py/charm.py @@ -134,6 +134,7 @@ def __init__(self): self.CkArraySend = self.lib.CkArraySend self.CkArraySendWithDeviceData = self.lib.CkArraySendWithDeviceData self.CkArraySendWithDeviceDataFromPointers = self.lib.CkArraySendWithDeviceDataFromPointers + self.CkCudaEnabled = self.lib.CkCudaEnabled self.reducers = reduction.ReducerContainer(self) self.redMgr = reduction.ReductionManager(self, self.reducers) self.mainchareRegistered = False diff --git a/charm4py/charmlib/ccharm.pxd b/charm4py/charmlib/ccharm.pxd index e11c204f..9f2e9a9c 100644 --- a/charm4py/charmlib/ccharm.pxd +++ b/charm4py/charmlib/ccharm.pxd @@ -82,6 +82,14 @@ cdef extern from "charm.h": long *devBufSizesInBytes, long *streamPtrs, int numDevBufs ); + void CkChareExtSendWithDeviceData_multi(int aid, int *idx, int ndims, + int epIdx, int num_bufs, char **bufs, + int *buf_sizes, + long *devBufPtrs, + long *devBufSizesInBytes, + long *streamPtrs, int numDevBufs + ); + void registerArrayMsgGPUDirectRecvExtCallback(void (*cb)(int, int, int*, int, int, long*, void *, int, char*, int)); void CkGetGPUDirectData(int numBuffers, void *recvBufPtrs, int *arrSizes, diff --git a/charm4py/charmlib/charmlib_cython.pyx b/charm4py/charmlib/charmlib_cython.pyx index 271af7c9..b8ab34d8 100644 --- a/charm4py/charmlib/charmlib_cython.pyx +++ b/charm4py/charmlib/charmlib_cython.pyx @@ -477,6 +477,7 @@ class CharmLib(object): assert num_direct_buffers <= NUM_DCOPY_BUFS global gpu_direct_device_ptrs global gpu_direct_stream_ptrs 
+ global cur_buf if stream_ptrs: for i in range(num_direct_buffers): @@ -484,12 +485,24 @@ class CharmLib(object): else: memset(gpu_direct_stream_ptrs, 0, sizeof(long) * num_direct_buffers) - CkChareExtSendWithDeviceData(array_id, c_index, ndims, ep, 1, msg0, len(msg0), - gpu_direct_device_ptrs, - gpu_direct_buff_sizes, - gpu_direct_stream_ptrs, - num_direct_buffers - ) + if cur_buf <= 1: + CkChareExtSendWithDeviceData(array_id, c_index, ndims, ep, 1, msg0, len(msg0), + gpu_direct_device_ptrs, + gpu_direct_buff_sizes, + gpu_direct_stream_ptrs, + num_direct_buffers + ) + else: + send_bufs[0] = msg0 + send_buf_sizes[0] = len(msg0) + CkChareExtSendWithDeviceData_multi(array_id, c_index, ndims, ep, + cur_buf, send_bufs, send_buf_sizes, + gpu_direct_device_ptrs, + gpu_direct_buff_sizes, + gpu_direct_stream_ptrs, + num_direct_buffers + ) + cur_buf = 1 gpu_direct_buf_idx = 0 def CkArraySendWithDeviceDataFromPointers(self, int array_id, index not None, int ep, @@ -499,7 +512,7 @@ class CharmLib(object): cdef int i = 0 cdef int ndims = len(index) - # assert ndims == 1 + global cur_buf for i in range(ndims): c_index[i] = index[i] msg0, dcopy = msg dcopy = None @@ -510,15 +523,28 @@ class CharmLib(object): else: memset(gpu_direct_stream_ptrs, 0, sizeof(long) * num_bufs) - CkChareExtSendWithDeviceData(array_id, c_index, ndims, ep, 1, msg0, len(msg0), - gpu_src_ptrs.data.as_voidptr, - gpu_src_sizes.data.as_voidptr, - gpu_direct_stream_ptrs, - num_bufs - ) + if cur_buf <= 1: + CkChareExtSendWithDeviceData(array_id, c_index, ndims, ep, 1, msg0, len(msg0), + gpu_src_ptrs.data.as_voidptr, + gpu_src_sizes.data.as_voidptr, + gpu_direct_stream_ptrs, + num_bufs + ) + else: + send_bufs[0] = msg0 + send_buf_sizes[0] = len(msg0) + CkChareExtSendWithDeviceData_multi(array_id, c_index, ndims, ep, + cur_buf, send_bufs, send_buf_sizes, + gpu_src_ptrs.data.as_voidptr, + gpu_src_sizes.data.as_voidptr, + gpu_direct_stream_ptrs, + num_bufs + ) + cur_buf = 1 gpu_direct_buf_idx = 0 - + def CkCudaEnabled(self): + return bool(CkCudaEnabled()) def CkArraySend(self, int array_id, index not None, int ep, msg not None): global cur_buf @@ -860,13 +886,15 @@ class CharmLib(object): msg = emptyMsg else: direct_copy_hdr = [] # goes to header - args = list(msgArgs) global cur_buf global gpu_direct_buf_idx global gpu_direct_device_ptrs cur_buf = 1 gpu_direct_buf_idx = 0 - for i in range(len(args)): + # GPU-direct buffers will not be sent + args_to_send = list() + n_gpu_bufs = 0 + for i in range(len(msgArgs)): arg = msgArgs[i] if CkCudaEnabled() and hasattr(arg, '__cuda_array_interface__'): if pack_gpu: @@ -878,32 +906,33 @@ class CharmLib(object): gpu_direct_buff_sizes[gpu_direct_buf_idx] = arg.nbytes cuda_dev_info = True gpu_direct_buf_idx += 1 - args[i] = None # TODO: should this be done? 
+ n_gpu_bufs += 1 continue elif isinstance(arg, np.ndarray) and not arg.dtype.hasobject: np_array = arg nbytes = np_array.nbytes - direct_copy_hdr.append((i, 2, (arg.shape, np_array.dtype.name), nbytes)) + direct_copy_hdr.append((i-n_gpu_bufs, 2, (arg.shape, np_array.dtype.name), nbytes)) send_bufs[cur_buf] = np_array.data elif isinstance(arg, bytes): nbytes = len(arg) - direct_copy_hdr.append((i, 0, (), nbytes)) + direct_copy_hdr.append((i-n_gpu_bufs, 0, (), nbytes)) send_bufs[cur_buf] = arg elif isinstance(arg, array.array): a = arg #nbytes = arg.buffer_info()[1] * arg.itemsize nbytes = len(a) * a.itemsize # NOTE that cython's array C interface doesn't expose itemsize attribute - direct_copy_hdr.append((i, 1, (a.typecode), nbytes)) + direct_copy_hdr.append((i-n_gpu_bufs, 1, (a.typecode), nbytes)) send_bufs[cur_buf] = a.data.as_voidptr else: + args_to_send.append(arg) continue - args[i] = None # will direct-copy this arg so remove from args list + args_to_send.append(None) send_buf_sizes[cur_buf] = nbytes if PROFILING: dcopy_size += nbytes cur_buf += 1 if len(direct_copy_hdr) > 0: header[b'dcopy'] = direct_copy_hdr try: - msg = dumps((header, args), PICKLE_PROTOCOL) + msg = dumps((header, args_to_send), PICKLE_PROTOCOL) except: global cur_buf global gpu_direct_buf_idx From 760e7162843f6ceff7e4922fab7cb36661221312 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Thu, 22 Apr 2021 14:20:12 -0400 Subject: [PATCH 088/107] removed debug print --- charm4py/channel.py | 1 - 1 file changed, 1 deletion(-) diff --git a/charm4py/channel.py b/charm4py/channel.py index 2e84d481..22e29efb 100644 --- a/charm4py/channel.py +++ b/charm4py/channel.py @@ -73,7 +73,6 @@ def recv(self, *post_buffers, post_buf_addresses = None, post_buf_sizes = None, if post_buffers: if isinstance(ret, tuple): - print(ret) gpu_recv_bufs = ret[-1] ret = ret[0:-1] if len(ret) == 1: From c254008dddfab68169119874e48c691ff1782363 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Thu, 22 Apr 2021 14:20:45 -0400 Subject: [PATCH 089/107] no need for different function when buffers + gpu arrays sent in same message --- charm4py/charmlib/ccharm.pxd | 19 +++++---------- charm4py/charmlib/charmlib_cython.pyx | 34 +++++++-------------------- 2 files changed, 15 insertions(+), 38 deletions(-) diff --git a/charm4py/charmlib/ccharm.pxd b/charm4py/charmlib/ccharm.pxd index 9f2e9a9c..12db2492 100644 --- a/charm4py/charmlib/ccharm.pxd +++ b/charm4py/charmlib/ccharm.pxd @@ -76,19 +76,12 @@ cdef extern from "charm.h": int CkCudaEnabled(); int CUDAPointerOnDevice(const void *ptr); void CkChareExtSendWithDeviceData(int aid, int *idx, int ndims, - int epIdx, int num_bufs, char *msg, - int msgSize, - long *devBufPtrs, - long *devBufSizesInBytes, - long *streamPtrs, int numDevBufs - ); - void CkChareExtSendWithDeviceData_multi(int aid, int *idx, int ndims, - int epIdx, int num_bufs, char **bufs, - int *buf_sizes, - long *devBufPtrs, - long *devBufSizesInBytes, - long *streamPtrs, int numDevBufs - ); + int epIdx, int num_bufs, char **bufs, + int *buf_sizes, + long *devBufPtrs, + long *devBufSizesInBytes, + long *streamPtrs, int numDevBufs + ); void registerArrayMsgGPUDirectRecvExtCallback(void (*cb)(int, int, int*, int, int, long*, void *, int, char*, int)); diff --git a/charm4py/charmlib/charmlib_cython.pyx b/charm4py/charmlib/charmlib_cython.pyx index b8ab34d8..58951fb1 100644 --- a/charm4py/charmlib/charmlib_cython.pyx +++ b/charm4py/charmlib/charmlib_cython.pyx @@ -485,23 +485,15 @@ class CharmLib(object): else: memset(gpu_direct_stream_ptrs, 0, 
sizeof(long) * num_direct_buffers) - if cur_buf <= 1: - CkChareExtSendWithDeviceData(array_id, c_index, ndims, ep, 1, msg0, len(msg0), + send_bufs[0] = msg0 + send_buf_sizes[0] = len(msg0) + CkChareExtSendWithDeviceData(array_id, c_index, ndims, ep, + cur_buf, send_bufs, send_buf_sizes, gpu_direct_device_ptrs, gpu_direct_buff_sizes, gpu_direct_stream_ptrs, num_direct_buffers - ) - else: - send_bufs[0] = msg0 - send_buf_sizes[0] = len(msg0) - CkChareExtSendWithDeviceData_multi(array_id, c_index, ndims, ep, - cur_buf, send_bufs, send_buf_sizes, - gpu_direct_device_ptrs, - gpu_direct_buff_sizes, - gpu_direct_stream_ptrs, - num_direct_buffers - ) + ) cur_buf = 1 gpu_direct_buf_idx = 0 @@ -523,23 +515,15 @@ class CharmLib(object): else: memset(gpu_direct_stream_ptrs, 0, sizeof(long) * num_bufs) - if cur_buf <= 1: - CkChareExtSendWithDeviceData(array_id, c_index, ndims, ep, 1, msg0, len(msg0), + send_bufs[0] = msg0 + send_buf_sizes[0] = len(msg0) + CkChareExtSendWithDeviceData(array_id, c_index, ndims, ep, + cur_buf, send_bufs, send_buf_sizes, gpu_src_ptrs.data.as_voidptr, gpu_src_sizes.data.as_voidptr, gpu_direct_stream_ptrs, num_bufs ) - else: - send_bufs[0] = msg0 - send_buf_sizes[0] = len(msg0) - CkChareExtSendWithDeviceData_multi(array_id, c_index, ndims, ep, - cur_buf, send_bufs, send_buf_sizes, - gpu_src_ptrs.data.as_voidptr, - gpu_src_sizes.data.as_voidptr, - gpu_direct_stream_ptrs, - num_bufs - ) cur_buf = 1 gpu_direct_buf_idx = 0 From 75f358d75d226ca5828446f48b00c972bcdb66bd Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Thu, 22 Apr 2021 14:28:25 -0400 Subject: [PATCH 090/107] tests sending one device array --- tests/cuda/single_array.py | 108 +++++++++++++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) create mode 100644 tests/cuda/single_array.py diff --git a/tests/cuda/single_array.py b/tests/cuda/single_array.py new file mode 100644 index 00000000..f720c439 --- /dev/null +++ b/tests/cuda/single_array.py @@ -0,0 +1,108 @@ +from charm4py import charm, Chare, Array, coro, Future, Channel, Group, ArrayMap +import time +import numpy as np +from numba import cuda +import array + + +class A(Chare): + def __init__(self, msg_size): + self.msg_size = msg_size + + + @coro + def run(self, done_future, addr_optimization = False): + partner = self.thisProxy[int(not self.thisIndex[0])] + partner_channel = Channel(self, partner) + + device_data = cuda.device_array(self.msg_size, dtype='int8') + # if addr_optimization: + d_addr = array.array('L', [0]) + d_size = array.array('L', [0]) + + d_addr[0] = device_data.__cuda_array_interface__['data'][0] + d_size[0] = device_data.nbytes + + my_stream = cuda.stream() + + if self.thisIndex[0]: + host_data = np.zeros(self.msg_size, dtype='int8') + host_data.fill(5) + device_data.copy_to_device(host_data) + if addr_optimization: + partner_channel.send(1, 2, "hello", + np.ones(self.msg_size, dtype='int8'), + gpu_src_ptrs = d_addr, gpu_src_sizes = d_size + ) + p_data = partner_channel.recv(post_buf_addresses = d_addr, post_buf_sizes = d_size) + else: + partner_channel.send(1,2, "hello", + device_data, + np.ones(self.msg_size, dtype='int8') + ) + p_data = partner_channel.recv(device_data) + + assert p_data == (2, 3) + h_ary = device_data.copy_to_host() + assert np.array_equal(h_ary, host_data) + + if addr_optimization: + partner_channel.send(gpu_src_ptrs = d_addr, gpu_src_sizes = d_size) + partner_channel.recv(post_buf_addresses = d_addr, post_buf_sizes = d_size) + else: + partner_channel.send(device_data) + partner_channel.recv(device_data) + + h_ary 
= device_data.copy_to_host() + assert np.array_equal(h_ary, host_data) + else: + h_data = np.ones(self.msg_size, dtype='int8') + if addr_optimization: + p_data = partner_channel.recv(post_buf_addresses = d_addr, post_buf_sizes = d_size) + else: + p_data = partner_channel.recv(device_data) + p_data, p_host_arr = p_data[0:-1], p_data[-1] + recvd = device_data.copy_to_host() + + compare = np.zeros(self.msg_size, dtype='int8') + compare.fill(5) + assert np.array_equal(recvd, compare) + assert np.array_equal(np.ones(self.msg_size, dtype='int8'), p_host_arr) + assert p_data == (1, 2, "hello") + + if addr_optimization: + partner_channel.send(2,3, gpu_src_ptrs = d_addr, gpu_src_sizes = d_size) + else: + partner_channel.send(2,3, device_data) + + if addr_optimization: + partner_channel.recv(post_buf_addresses = d_addr, post_buf_sizes = d_size) + partner_channel.send(gpu_src_ptrs = d_addr, gpu_src_sizes = d_size) + else: + partner_channel.recv(device_data) + partner_channel.send(device_data) + + + self.reduce(done_future) + +class ArrMap(ArrayMap): + def procNum(self, index): + return index[0] % 2 + +def main(args): + # if this is not a cuda-aware build, + # vacuously pass the test + if not charm.CkCudaEnabled(): + print("WARNING: Charm4Py was not build with CUDA-enabled Charm++. " + "GPU-Direct functionality will not be tested" + ) + charm.exit(0) + + peMap = Group(ArrMap) + chares = Array(A, 2, args=[8192], map = peMap) + done_fut = Future() + chares.run(done_fut, addr_optimization = False) + done_fut.get() + charm.exit(0) + +charm.start(main) From 7ee684e9d62942d3c7ecdd884c8c3864011ad3d9 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Thu, 22 Apr 2021 14:32:44 -0400 Subject: [PATCH 091/107] flake8 compliance --- tests/cuda/single_array.py | 53 ++++++++++++++++++++++++-------------- 1 file changed, 33 insertions(+), 20 deletions(-) diff --git a/tests/cuda/single_array.py b/tests/cuda/single_array.py index f720c439..526aaf92 100644 --- a/tests/cuda/single_array.py +++ b/tests/cuda/single_array.py @@ -1,5 +1,4 @@ from charm4py import charm, Chare, Array, coro, Future, Channel, Group, ArrayMap -import time import numpy as np from numba import cuda import array @@ -9,9 +8,8 @@ class A(Chare): def __init__(self, msg_size): self.msg_size = msg_size - @coro - def run(self, done_future, addr_optimization = False): + def run(self, done_future, addr_optimization=False): partner = self.thisProxy[int(not self.thisIndex[0])] partner_channel = Channel(self, partner) @@ -23,8 +21,6 @@ def run(self, done_future, addr_optimization = False): d_addr[0] = device_data.__cuda_array_interface__['data'][0] d_size[0] = device_data.nbytes - my_stream = cuda.stream() - if self.thisIndex[0]: host_data = np.zeros(self.msg_size, dtype='int8') host_data.fill(5) @@ -32,11 +28,13 @@ def run(self, done_future, addr_optimization = False): if addr_optimization: partner_channel.send(1, 2, "hello", np.ones(self.msg_size, dtype='int8'), - gpu_src_ptrs = d_addr, gpu_src_sizes = d_size + gpu_src_ptrs=d_addr, gpu_src_sizes=d_size ) - p_data = partner_channel.recv(post_buf_addresses = d_addr, post_buf_sizes = d_size) + p_data = partner_channel.recv(post_buf_addresses=d_addr, + post_buf_sizes=d_size + ) else: - partner_channel.send(1,2, "hello", + partner_channel.send(1, 2, "hello", device_data, np.ones(self.msg_size, dtype='int8') ) @@ -47,8 +45,10 @@ def run(self, done_future, addr_optimization = False): assert np.array_equal(h_ary, host_data) if addr_optimization: - partner_channel.send(gpu_src_ptrs = d_addr, gpu_src_sizes = d_size) - 
partner_channel.recv(post_buf_addresses = d_addr, post_buf_sizes = d_size) + partner_channel.send(gpu_src_ptrs=d_addr, gpu_src_sizes=d_size) + partner_channel.recv(post_buf_addresses=d_addr, + post_buf_sizes=d_size + ) else: partner_channel.send(device_data) partner_channel.recv(device_data) @@ -56,9 +56,10 @@ def run(self, done_future, addr_optimization = False): h_ary = device_data.copy_to_host() assert np.array_equal(h_ary, host_data) else: - h_data = np.ones(self.msg_size, dtype='int8') if addr_optimization: - p_data = partner_channel.recv(post_buf_addresses = d_addr, post_buf_sizes = d_size) + p_data = partner_channel.recv(post_buf_addresses=d_addr, + post_buf_sizes=d_size + ) else: p_data = partner_channel.recv(device_data) p_data, p_host_arr = p_data[0:-1], p_data[-1] @@ -67,28 +68,35 @@ def run(self, done_future, addr_optimization = False): compare = np.zeros(self.msg_size, dtype='int8') compare.fill(5) assert np.array_equal(recvd, compare) - assert np.array_equal(np.ones(self.msg_size, dtype='int8'), p_host_arr) + assert np.array_equal(np.ones(self.msg_size, dtype='int8'), + p_host_arr + ) assert p_data == (1, 2, "hello") if addr_optimization: - partner_channel.send(2,3, gpu_src_ptrs = d_addr, gpu_src_sizes = d_size) + partner_channel.send(2, 3, gpu_src_ptrs=d_addr, + gpu_src_sizes=d_size + ) else: - partner_channel.send(2,3, device_data) + partner_channel.send(2, 3, device_data) if addr_optimization: - partner_channel.recv(post_buf_addresses = d_addr, post_buf_sizes = d_size) - partner_channel.send(gpu_src_ptrs = d_addr, gpu_src_sizes = d_size) + partner_channel.recv(post_buf_addresses=d_addr, + post_buf_sizes=d_size + ) + partner_channel.send(gpu_src_ptrs=d_addr, gpu_src_sizes=d_size) else: partner_channel.recv(device_data) partner_channel.send(device_data) - self.reduce(done_future) + class ArrMap(ArrayMap): def procNum(self, index): return index[0] % 2 + def main(args): # if this is not a cuda-aware build, # vacuously pass the test @@ -99,10 +107,15 @@ def main(args): charm.exit(0) peMap = Group(ArrMap) - chares = Array(A, 2, args=[8192], map = peMap) + chares = Array(A, 2, args=[8192], map=peMap) + done_fut = Future() + chares.run(done_fut, addr_optimization=False) + done_fut.get() + done_fut = Future() - chares.run(done_fut, addr_optimization = False) + chares.run(done_fut, addr_optimization=True) done_fut.get() charm.exit(0) + charm.start(main) From 4f84baaa66f6b97636694781247c55511b7523e0 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Thu, 22 Apr 2021 15:27:34 -0400 Subject: [PATCH 092/107] fixed hangup when multiple device arrays are sent --- charm4py/charm.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/charm4py/charm.py b/charm4py/charm.py index 71df707f..afc67a34 100644 --- a/charm4py/charm.py +++ b/charm4py/charm.py @@ -357,7 +357,8 @@ def unpackMsg(self, msg, dcopy_start, dest_obj): return header, args def getGPUDirectData(self, post_buffers, remote_bufs, stream_ptrs): - return_fut = self.Future(len(post_buffers)) + # this future will only be satisfied when all buffers have been received + return_fut = self.Future() post_buf_data = [getDeviceDataAddress(buf) for buf in post_buffers] post_buf_sizes = [getDeviceDataSizeInBytes(buf) for buf in post_buffers] if not stream_ptrs: @@ -366,7 +367,8 @@ def getGPUDirectData(self, post_buffers, remote_bufs, stream_ptrs): return return_fut def getGPUDirectDataFromAddresses(self, post_buf_ptrs, post_buf_sizes, remote_bufs, stream_ptrs): - return_fut = self.Future(len(post_buf_ptrs)) + # this future will 
only be satisfied when all buffers have been received + return_fut = self.Future() if not stream_ptrs: stream_ptrs = array.array('L', [0] * len(post_buf_ptrs)) self.lib.getGPUDirectDataFromAddresses(post_buf_ptrs, post_buf_sizes, remote_bufs, stream_ptrs, return_fut) From 49c82a19e3faaf35a70b140e600ba1a7a185b59b Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Thu, 22 Apr 2021 15:46:34 -0400 Subject: [PATCH 093/107] remove comment --- tests/cuda/single_array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/cuda/single_array.py b/tests/cuda/single_array.py index 526aaf92..de0606c3 100644 --- a/tests/cuda/single_array.py +++ b/tests/cuda/single_array.py @@ -14,7 +14,7 @@ def run(self, done_future, addr_optimization=False): partner_channel = Channel(self, partner) device_data = cuda.device_array(self.msg_size, dtype='int8') - # if addr_optimization: + d_addr = array.array('L', [0]) d_size = array.array('L', [0]) From f9ffaf665b60ed0185e147e3be9fc90f9de7e129 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Thu, 22 Apr 2021 15:46:53 -0400 Subject: [PATCH 094/107] change != None to is not None --- charm4py/channel.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/charm4py/channel.py b/charm4py/channel.py index 22e29efb..15a407d1 100644 --- a/charm4py/channel.py +++ b/charm4py/channel.py @@ -79,10 +79,11 @@ def recv(self, *post_buffers, post_buf_addresses = None, post_buf_sizes = None, ret = ret[0] else: gpu_recv_bufs = ret + assert len(post_buffers) == len(gpu_recv_bufs) recv_future = charm.getGPUDirectData(post_buffers, gpu_recv_bufs, stream_ptrs) recv_future.get() - elif post_buf_addresses != None: + elif post_buf_addresses is not None: if isinstance(ret, tuple): gpu_recv_bufs = ret[-1] ret = ret[0:-1] From 46851e68a96dfa69e95cfc354bc4f39a6979790b Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Thu, 22 Apr 2021 15:47:35 -0400 Subject: [PATCH 095/107] tests for multiple arrays --- tests/cuda/multi_array.py | 85 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100644 tests/cuda/multi_array.py diff --git a/tests/cuda/multi_array.py b/tests/cuda/multi_array.py new file mode 100644 index 00000000..4e338b4e --- /dev/null +++ b/tests/cuda/multi_array.py @@ -0,0 +1,85 @@ +from charm4py import charm, Chare, Array, coro, Future, Channel, Group, ArrayMap +import numpy as np +from numba import cuda +import array + + +class A(Chare): + def __init__(self, msg_size): + self.msg_size = msg_size + + @coro + def run(self, done_future, addr_optimization=False): + partner = self.thisProxy[int(not self.thisIndex[0])] + partner_channel = Channel(self, partner) + + device_data = cuda.device_array(self.msg_size, dtype='int8') + device_data2 = cuda.device_array(self.msg_size, dtype='int8') + # if addr_optimization: + d_addr = array.array('L', [0, 0]) + d_size = array.array('L', [0, 0]) + + d_addr[0] = device_data.__cuda_array_interface__['data'][0] + d_addr[1] = device_data2.__cuda_array_interface__['data'][0] + + d_size[0] = device_data.nbytes + d_size[1] = device_data2.nbytes + + host_array = np.array(self.msg_size, dtype='int32') + host_array.fill(42) + + if self.thisIndex[0]: + h1 = np.ones(self.msg_size, dtype='int8') + h2 = np.zeros(self.msg_size, dtype='int8') + device_data.copy_to_device(h1) + device_data2.copy_to_device(h2) + if addr_optimization: + partner_channel.send(20, host_array, gpu_src_ptrs=d_addr, + gpu_src_sizes=d_size + ) + else: + partner_channel.send(20, host_array, device_data, device_data2) + else: + if 
addr_optimization: + f, g = partner_channel.recv(device_data, device_data2) + else: + f, g = partner_channel.recv(post_buf_addresses=d_addr, + post_buf_sizes=d_addr + ) + h1 = device_data.copy_to_host() + h2 = device_data2.copy_to_host() + + assert f == 20 + assert np.array_equal(host_array, g) + assert np.array_equal(h1, np.ones(self.msg_size, dtype='int8')) + assert np.array_equal(h2, np.zeros(self.msg_size, dtype='int8')) + self.reduce(done_future) + + +class ArrMap(ArrayMap): + def procNum(self, index): + return index[0] % 2 + + +def main(args): + # if this is not a cuda-aware build, + # vacuously pass the test + if not charm.CkCudaEnabled(): + print("WARNING: Charm4Py was not build with CUDA-enabled Charm++. " + "GPU-Direct functionality will not be tested" + ) + charm.exit(0) + + peMap = Group(ArrMap) + chares = Array(A, 2, args=[8192], map=peMap) + done_fut = Future() + chares.run(done_fut, addr_optimization=False) + done_fut.get() + + done_fut = Future() + chares.run(done_fut, addr_optimization=True) + done_fut.get() + charm.exit(0) + + +charm.start(main) From e730942b0d0fea4dfd984980ad9776b567d51fb9 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Sat, 24 Apr 2021 15:08:44 -0400 Subject: [PATCH 096/107] sizes should be int to match charm++ side --- charm4py/charmlib/ccharm.pxd | 4 +-- charm4py/charmlib/charmlib_cython.pyx | 46 +++++++++++++-------------- tests/cuda/multi_array.py | 12 ++++--- tests/cuda/single_array.py | 4 +-- 4 files changed, 34 insertions(+), 32 deletions(-) diff --git a/charm4py/charmlib/ccharm.pxd b/charm4py/charmlib/ccharm.pxd index 12db2492..3d7e40d5 100644 --- a/charm4py/charmlib/ccharm.pxd +++ b/charm4py/charmlib/ccharm.pxd @@ -79,12 +79,12 @@ cdef extern from "charm.h": int epIdx, int num_bufs, char **bufs, int *buf_sizes, long *devBufPtrs, - long *devBufSizesInBytes, + int *devBufSizesInBytes, long *streamPtrs, int numDevBufs ); - void registerArrayMsgGPUDirectRecvExtCallback(void (*cb)(int, int, int*, int, int, long*, void *, int, char*, int)); + void registerArrayMsgGPUDirectRecvExtCallback(void (*cb)(int, int, int*, int, int, int*, void *, int, char*, int)); void CkGetGPUDirectData(int numBuffers, void *recvBufPtrs, int *arrSizes, void *remoteBufInfo, void *streamPtrs, int futureId); diff --git a/charm4py/charmlib/charmlib_cython.pyx b/charm4py/charmlib/charmlib_cython.pyx index 58951fb1..b54044ff 100644 --- a/charm4py/charmlib/charmlib_cython.pyx +++ b/charm4py/charmlib/charmlib_cython.pyx @@ -309,7 +309,7 @@ cdef c_type_table_typecodes = [None] * 13 cdef int c_type_table_sizes[13] cdef int[SECTION_MAX_BFACTOR] section_children cdef long[NUM_DCOPY_BUFS] gpu_direct_device_ptrs -cdef long[NUM_DCOPY_BUFS] gpu_direct_buff_sizes +cdef int[NUM_DCOPY_BUFS] gpu_direct_buff_sizes cdef long[NUM_DCOPY_BUFS] gpu_direct_stream_ptrs cdef object charm @@ -485,16 +485,16 @@ class CharmLib(object): else: memset(gpu_direct_stream_ptrs, 0, sizeof(long) * num_direct_buffers) - send_bufs[0] = msg0 - send_buf_sizes[0] = len(msg0) - CkChareExtSendWithDeviceData(array_id, c_index, ndims, ep, - cur_buf, send_bufs, send_buf_sizes, - gpu_direct_device_ptrs, - gpu_direct_buff_sizes, - gpu_direct_stream_ptrs, - num_direct_buffers - ) - cur_buf = 1 + send_bufs[0] = msg0 + send_buf_sizes[0] = len(msg0) + CkChareExtSendWithDeviceData(array_id, c_index, ndims, ep, + cur_buf, send_bufs, send_buf_sizes, + gpu_direct_device_ptrs, + gpu_direct_buff_sizes, + gpu_direct_stream_ptrs, + num_direct_buffers + ) + cur_buf = 1 gpu_direct_buf_idx = 0 def CkArraySendWithDeviceDataFromPointers(self, 
int array_id, index not None, int ep, @@ -515,16 +515,16 @@ class CharmLib(object): else: memset(gpu_direct_stream_ptrs, 0, sizeof(long) * num_bufs) - send_bufs[0] = msg0 - send_buf_sizes[0] = len(msg0) - CkChareExtSendWithDeviceData(array_id, c_index, ndims, ep, - cur_buf, send_bufs, send_buf_sizes, - gpu_src_ptrs.data.as_voidptr, - gpu_src_sizes.data.as_voidptr, - gpu_direct_stream_ptrs, - num_bufs - ) - cur_buf = 1 + send_bufs[0] = msg0 + send_buf_sizes[0] = len(msg0) + CkChareExtSendWithDeviceData(array_id, c_index, ndims, ep, + cur_buf, send_bufs, send_buf_sizes, + gpu_src_ptrs.data.as_voidptr, + gpu_src_sizes.data.as_voidptr, + gpu_direct_stream_ptrs, + num_bufs + ) + cur_buf = 1 gpu_direct_buf_idx = 0 def CkCudaEnabled(self): @@ -933,8 +933,8 @@ class CharmLib(object): def getGPUDirectData(self, list post_buf_data, list post_buf_sizes, array.array remote_bufs, list stream_ptrs, return_fut): cdef int num_buffers = len(post_buf_data) cdef int future_id = return_fut.fid - cdef array.array int_array_template = array.array('i', []) cdef array.array long_array_template = array.array('L', []) + cdef array.array int_array_template = array.array('i', []) cdef array.array recv_buf_sizes cdef array.array recv_buf_ptrs # pointers from the remote that we will be issuing Rgets for @@ -1034,7 +1034,7 @@ cdef void recvArrayMsg(int aid, int ndims, int *arrayIndex, int ep, int msgSize, charm.handleGeneralError() cdef void recvGPUDirectMsg(int aid, int ndims, int *arrayIndex, int ep, int numDevBuffs, - long *devBufSizes, void *devBufs, int msgSize, + int *devBufSizes, void *devBufs, int msgSize, char *msg, int dcopy_start): cdef int idx = 0 diff --git a/tests/cuda/multi_array.py b/tests/cuda/multi_array.py index 4e338b4e..4dc26c74 100644 --- a/tests/cuda/multi_array.py +++ b/tests/cuda/multi_array.py @@ -17,7 +17,7 @@ def run(self, done_future, addr_optimization=False): device_data2 = cuda.device_array(self.msg_size, dtype='int8') # if addr_optimization: d_addr = array.array('L', [0, 0]) - d_size = array.array('L', [0, 0]) + d_size = array.array('i', [0, 0]) d_addr[0] = device_data.__cuda_array_interface__['data'][0] d_addr[1] = device_data2.__cuda_array_interface__['data'][0] @@ -37,15 +37,17 @@ def run(self, done_future, addr_optimization=False): partner_channel.send(20, host_array, gpu_src_ptrs=d_addr, gpu_src_sizes=d_size ) + partner_channel.recv() else: partner_channel.send(20, host_array, device_data, device_data2) else: if addr_optimization: - f, g = partner_channel.recv(device_data, device_data2) - else: f, g = partner_channel.recv(post_buf_addresses=d_addr, - post_buf_sizes=d_addr + post_buf_sizes=d_size ) + else: + f, g = partner_channel.recv(device_data, device_data2) + partner_channel.send(1) h1 = device_data.copy_to_host() h2 = device_data2.copy_to_host() @@ -71,7 +73,7 @@ def main(args): charm.exit(0) peMap = Group(ArrMap) - chares = Array(A, 2, args=[8192], map=peMap) + chares = Array(A, 2, args=[(1<<30)], map=peMap) done_fut = Future() chares.run(done_fut, addr_optimization=False) done_fut.get() diff --git a/tests/cuda/single_array.py b/tests/cuda/single_array.py index de0606c3..323daa88 100644 --- a/tests/cuda/single_array.py +++ b/tests/cuda/single_array.py @@ -16,7 +16,7 @@ def run(self, done_future, addr_optimization=False): device_data = cuda.device_array(self.msg_size, dtype='int8') d_addr = array.array('L', [0]) - d_size = array.array('L', [0]) + d_size = array.array('i', [0]) d_addr[0] = device_data.__cuda_array_interface__['data'][0] d_size[0] = device_data.nbytes @@ -107,7 
+107,7 @@ def main(args): charm.exit(0) peMap = Group(ArrMap) - chares = Array(A, 2, args=[8192], map=peMap) + chares = Array(A, 2, args=[1<<20], map=peMap) done_fut = Future() chares.run(done_fut, addr_optimization=False) done_fut.get() From 491de28acf898402fcb4de7875efba993f422032 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Mon, 26 Apr 2021 11:32:20 -0400 Subject: [PATCH 097/107] streams supported at charm4py layer --- charm4py/charmlib/charmlib_cython.pyx | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/charm4py/charmlib/charmlib_cython.pyx b/charm4py/charmlib/charmlib_cython.pyx index b54044ff..8acdbd53 100644 --- a/charm4py/charmlib/charmlib_cython.pyx +++ b/charm4py/charmlib/charmlib_cython.pyx @@ -463,7 +463,7 @@ class CharmLib(object): cur_buf = 1 def CkArraySendWithDeviceData(self, int array_id, index not None, int ep, - msg not None, list stream_ptrs): + msg not None, stream_ptrs): global gpu_direct_buf_idx cdef int i = 0 @@ -479,10 +479,10 @@ class CharmLib(object): global gpu_direct_stream_ptrs global cur_buf - if stream_ptrs: + if stream_ptrs and isinstance(stream_ptrs, list): for i in range(num_direct_buffers): gpu_direct_stream_ptrs[i] = stream_ptrs[i] - else: + elif not stream_ptrs: memset(gpu_direct_stream_ptrs, 0, sizeof(long) * num_direct_buffers) send_bufs[0] = msg0 @@ -500,7 +500,7 @@ class CharmLib(object): def CkArraySendWithDeviceDataFromPointers(self, int array_id, index not None, int ep, msg not None, array.array gpu_src_ptrs, array.array gpu_src_sizes, - list stream_ptrs, int num_bufs): + stream_ptrs, int num_bufs): cdef int i = 0 cdef int ndims = len(index) @@ -509,7 +509,7 @@ class CharmLib(object): msg0, dcopy = msg dcopy = None - if stream_ptrs: + if stream_ptrs and isinstance(stream_ptrs, list): for i in range(num_bufs): gpu_direct_stream_ptrs[i] = stream_ptrs[i] else: From fbcef04d26578fc530fc3b41f6137ad6a35895c9 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Mon, 26 Apr 2021 11:32:31 -0400 Subject: [PATCH 098/107] use streams when provided --- tests/cuda/single_array.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tests/cuda/single_array.py b/tests/cuda/single_array.py index 323daa88..e53276f0 100644 --- a/tests/cuda/single_array.py +++ b/tests/cuda/single_array.py @@ -21,6 +21,9 @@ def run(self, done_future, addr_optimization=False): d_addr[0] = device_data.__cuda_array_interface__['data'][0] d_size[0] = device_data.nbytes + my_stream = cuda.stream() + stream_addr = array.array('L', [my_stream.handle.value]) + if self.thisIndex[0]: host_data = np.zeros(self.msg_size, dtype='int8') host_data.fill(5) @@ -28,15 +31,18 @@ def run(self, done_future, addr_optimization=False): if addr_optimization: partner_channel.send(1, 2, "hello", np.ones(self.msg_size, dtype='int8'), - gpu_src_ptrs=d_addr, gpu_src_sizes=d_size + gpu_src_ptrs=d_addr, gpu_src_sizes=d_size, + stream_ptrs=stream_addr ) p_data = partner_channel.recv(post_buf_addresses=d_addr, - post_buf_sizes=d_size + post_buf_sizes=d_size, + stream_ptrs=stream_addr ) else: partner_channel.send(1, 2, "hello", device_data, - np.ones(self.msg_size, dtype='int8') + np.ones(self.msg_size, dtype='int8'), + stream_ptrs=stream_addr ) p_data = partner_channel.recv(device_data) From f7d9083f75c079d156e8d66abddd2c9ad80c0698 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Mon, 26 Apr 2021 12:44:43 -0400 Subject: [PATCH 099/107] device send function array-specific --- charm4py/charmlib/ccharm.pxd | 2 +- charm4py/charmlib/charmlib_cython.pyx | 13 +++++++------ 2 
files changed, 8 insertions(+), 7 deletions(-) diff --git a/charm4py/charmlib/ccharm.pxd b/charm4py/charmlib/ccharm.pxd index 3d7e40d5..902d2127 100644 --- a/charm4py/charmlib/ccharm.pxd +++ b/charm4py/charmlib/ccharm.pxd @@ -75,7 +75,7 @@ cdef extern from "charm.h": # TODO: Organize these to place them near their related functions int CkCudaEnabled(); int CUDAPointerOnDevice(const void *ptr); - void CkChareExtSendWithDeviceData(int aid, int *idx, int ndims, + void CkArrayExtSendWithDeviceData(int aid, int *idx, int ndims, int epIdx, int num_bufs, char **bufs, int *buf_sizes, long *devBufPtrs, diff --git a/charm4py/charmlib/charmlib_cython.pyx b/charm4py/charmlib/charmlib_cython.pyx index 8acdbd53..a07a14b2 100644 --- a/charm4py/charmlib/charmlib_cython.pyx +++ b/charm4py/charmlib/charmlib_cython.pyx @@ -487,7 +487,7 @@ class CharmLib(object): send_bufs[0] = msg0 send_buf_sizes[0] = len(msg0) - CkChareExtSendWithDeviceData(array_id, c_index, ndims, ep, + CkArrayExtSendWithDeviceData(array_id, c_index, ndims, ep, cur_buf, send_bufs, send_buf_sizes, gpu_direct_device_ptrs, gpu_direct_buff_sizes, @@ -517,7 +517,7 @@ class CharmLib(object): send_bufs[0] = msg0 send_buf_sizes[0] = len(msg0) - CkChareExtSendWithDeviceData(array_id, c_index, ndims, ep, + CkArrayExtSendWithDeviceData(array_id, c_index, ndims, ep, cur_buf, send_bufs, send_buf_sizes, gpu_src_ptrs.data.as_voidptr, gpu_src_sizes.data.as_voidptr, @@ -787,7 +787,7 @@ class CharmLib(object): registerChareMsgRecvExtCallback(recvChareMsg) registerGroupMsgRecvExtCallback(recvGroupMsg) registerArrayMsgRecvExtCallback(recvArrayMsg) - registerArrayMsgGPUDirectRecvExtCallback(recvGPUDirectMsg) + registerArrayMsgGPUDirectRecvExtCallback(recvGPUDirectArrayMsg) registerArrayBcastRecvExtCallback(recvArrayBcast) registerArrayMapProcNumExtCallback(arrayMapProcNum) registerArrayElemJoinExtCallback(arrayElemJoin) @@ -1033,9 +1033,9 @@ cdef void recvArrayMsg(int aid, int ndims, int *arrayIndex, int ep, int msgSize, except: charm.handleGeneralError() -cdef void recvGPUDirectMsg(int aid, int ndims, int *arrayIndex, int ep, int numDevBuffs, - int *devBufSizes, void *devBufs, int msgSize, - char *msg, int dcopy_start): +cdef void recvGPUDirectArrayMsg(int aid, int ndims, int *arrayIndex, int ep, int numDevBuffs, + int *devBufSizes, void *devBufs, int msgSize, + char *msg, int dcopy_start): cdef int idx = 0 try: @@ -1047,6 +1047,7 @@ cdef void recvGPUDirectMsg(int aid, int ndims, int *arrayIndex, int ep, int numD # Add the buffer's address to the list devBufInfo[idx] = devBufs+(CK_DEVICEBUFFER_SIZE_IN_BYTES*idx) recv_buffer.setMsg(msg, msgSize) + # TODO: Can this be the same for array and groups? 
charm.recvGPUDirectMsg(aid, array_index_to_tuple(ndims, arrayIndex), ep, devBufInfo, recv_buffer, dcopy_start) except: From aae2ff16ce562a85ce02af2fc8ab50bf6a914bb6 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Tue, 18 May 2021 11:26:08 -0400 Subject: [PATCH 100/107] let CkArraySendWithDeviceDataFromPointers determine number of buffers --- charm4py/chare.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/charm4py/chare.py b/charm4py/chare.py index 35d53bd8..49f15f22 100644 --- a/charm4py/chare.py +++ b/charm4py/chare.py @@ -768,9 +768,10 @@ def proxy_entry_method(proxy, *args, **kwargs): charm.CkArraySendWithDeviceDataFromPointers(aid, elemIdx, ep, msg, kwargs['gpu_src_ptrs'], kwargs['gpu_src_sizes'], - stream_ptrs, - len(kwargs['gpu_src_ptrs']) - ) + stream_ptrs + ) + + else: charm.CkArraySend(aid, elemIdx, ep, msg) From abf27e35febf859ae118a119bc85b3df308361d8 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Tue, 18 May 2021 11:27:04 -0400 Subject: [PATCH 101/107] add general slower case for getting gpu pointers/sizes from memoryviews --- charm4py/charm.py | 25 ++++++++++++++++- charm4py/charmlib/charmlib_cython.pyx | 40 ++++++++++++++++++++++++--- 2 files changed, 60 insertions(+), 5 deletions(-) diff --git a/charm4py/charm.py b/charm4py/charm.py index afc67a34..64085ef3 100644 --- a/charm4py/charm.py +++ b/charm4py/charm.py @@ -133,7 +133,8 @@ def __init__(self): self.CkGroupSend = self.lib.CkGroupSend self.CkArraySend = self.lib.CkArraySend self.CkArraySendWithDeviceData = self.lib.CkArraySendWithDeviceData - self.CkArraySendWithDeviceDataFromPointers = self.lib.CkArraySendWithDeviceDataFromPointers + self.CkArraySendWithDeviceDataFromPointersArray = self.lib.CkArraySendWithDeviceDataFromPointersArray + self.CkArraySendWithDeviceDataFromPointersOther = self.lib.CkArraySendWithDeviceDataFromPointersOther self.CkCudaEnabled = self.lib.CkCudaEnabled self.reducers = reduction.ReducerContainer(self) self.redMgr = reduction.ReductionManager(self, self.reducers) @@ -374,6 +375,28 @@ def getGPUDirectDataFromAddresses(self, post_buf_ptrs, post_buf_sizes, remote_bu self.lib.getGPUDirectDataFromAddresses(post_buf_ptrs, post_buf_sizes, remote_bufs, stream_ptrs, return_fut) return return_fut + def CkArraySendWithDeviceDataFromPointers(self, array_id, index, ep, + msg, gpu_src_ptrs, + gpu_src_sizes, + stream_ptrs + ): + if isinstance(gpu_src_ptrs, array.array): + assert isinstance(gpu_src_sizes, array.array), \ + "GPU source pointers and sizes must be of the same type." 
+ self.CkArraySendWithDeviceDataFromPointersArray(array_id, index, ep, + msg, gpu_src_ptrs, + gpu_src_sizes, + stream_ptrs, + len(gpu_src_ptrs) + ) + else: + self.CkArraySendWithDeviceDataFromPointersOther(array_id, index, ep, + msg, gpu_src_ptrs, + gpu_src_sizes, + stream_ptrs, + len(gpu_src_ptrs) + ) + # deposit value of one of the futures that was created on this PE def _future_deposit_result(self, fid, result=None): self.threadMgr.depositFuture(fid, result) diff --git a/charm4py/charmlib/charmlib_cython.pyx b/charm4py/charmlib/charmlib_cython.pyx index a07a14b2..1f90703d 100644 --- a/charm4py/charmlib/charmlib_cython.pyx +++ b/charm4py/charmlib/charmlib_cython.pyx @@ -497,10 +497,10 @@ class CharmLib(object): cur_buf = 1 gpu_direct_buf_idx = 0 - def CkArraySendWithDeviceDataFromPointers(self, int array_id, index not None, int ep, - msg not None, array.array gpu_src_ptrs, - array.array gpu_src_sizes, - stream_ptrs, int num_bufs): + def CkArraySendWithDeviceDataFromPointersArray(self, int array_id, index not None, int ep, + msg not None, array.array gpu_src_ptrs, + array.array gpu_src_sizes, + stream_ptrs, int num_bufs): cdef int i = 0 cdef int ndims = len(index) @@ -527,6 +527,38 @@ class CharmLib(object): cur_buf = 1 gpu_direct_buf_idx = 0 + def CkArraySendWithDeviceDataFromPointersOther(self, int array_id, index not None, int ep, + msg not None, gpu_src_ptrs, + gpu_src_sizes, + stream_ptrs, int num_bufs + ): + cdef int i = 0 + cdef int ndims = len(index) + global cur_buf + for i in range(ndims): c_index[i] = index[i] + msg0, dcopy = msg + dcopy = None + cdef unsigned long[:] gpu_addresses = gpu_src_ptrs + cdef int[:] gpu_buffer_sizes = gpu_src_sizes + + if stream_ptrs and isinstance(stream_ptrs, list): + for i in range(num_bufs): + gpu_direct_stream_ptrs[i] = stream_ptrs[i] + else: + memset(gpu_direct_stream_ptrs, 0, sizeof(long) * num_bufs) + + send_bufs[0] = msg0 + send_buf_sizes[0] = len(msg0) + CkArrayExtSendWithDeviceData(array_id, c_index, ndims, ep, + cur_buf, send_bufs, send_buf_sizes, + &gpu_addresses[0], + &gpu_buffer_sizes[0], + gpu_direct_stream_ptrs, + num_bufs + ) + cur_buf = 1 + gpu_direct_buf_idx = 0 + def CkCudaEnabled(self): return bool(CkCudaEnabled()) From 8188d2955f93505817cc7f3162bb6cdc608b7a35 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 19 May 2021 15:13:49 -0400 Subject: [PATCH 102/107] use Charm++ functionality for GPU-direct group support --- charm4py/charmlib/ccharm.pxd | 6 ++ charm4py/charmlib/charmlib_cython.pyx | 108 +++++++++++++++++++++++++- 2 files changed, 113 insertions(+), 1 deletion(-) diff --git a/charm4py/charmlib/ccharm.pxd b/charm4py/charmlib/ccharm.pxd index 902d2127..2de91517 100644 --- a/charm4py/charmlib/ccharm.pxd +++ b/charm4py/charmlib/ccharm.pxd @@ -53,6 +53,7 @@ cdef extern from "charm.h": void registerReadOnlyRecvExtCallback(void (*cb)(int, char*)); void registerChareMsgRecvExtCallback(void (*cb)(int, void*, int, int, char*, int)); void registerGroupMsgRecvExtCallback(void (*cb)(int, int, int, char *, int)); + void registerGroupMsgGPUDirectRecvExtCallback(void (*cb)(int, int, int, int *, void *, int, char *, int)); void registerArrayMsgRecvExtCallback(void (*cb)(int, int, int *, int, int, char *, int)); void registerArrayBcastRecvExtCallback(void (*cb)(int, int, int, int, int *, int, int, char *, int)); void registerArrayElemLeaveExtCallback(int (*cb)(int, int, int *, char**, int)); @@ -82,6 +83,11 @@ cdef extern from "charm.h": int *devBufSizesInBytes, long *streamPtrs, int numDevBufs ); + void 
CkGroupExtSendWithDeviceData(int gid, int pe, int epIdx, int num_bufs, char **bufs, + int *buf_sizes, long *devBufPtrs, + int *devBufSizesInBytes, + long *streamPtrs, int numDevBufs + ); void registerArrayMsgGPUDirectRecvExtCallback(void (*cb)(int, int, int*, int, int, int*, void *, int, char*, int)); diff --git a/charm4py/charmlib/charmlib_cython.pyx b/charm4py/charmlib/charmlib_cython.pyx index 1f90703d..0183ea10 100644 --- a/charm4py/charmlib/charmlib_cython.pyx +++ b/charm4py/charmlib/charmlib_cython.pyx @@ -462,6 +462,93 @@ class CharmLib(object): CkGroupExtSend_multi(group_id, num_pes, section_children, ep, cur_buf, send_bufs, send_buf_sizes) cur_buf = 1 + def CkGroupSendWithDeviceData(self, int group_id, int index, int ep, + msg not None, stream_ptrs): + global gpu_direct_buf_idx + cdef int i = 0 + msg0, dcopy = msg + dcopy = None + cdef int num_direct_buffers = gpu_direct_buf_idx + # TODO: Message on assertion failure + assert num_direct_buffers <= NUM_DCOPY_BUFS + global gpu_direct_device_ptrs + global gpu_direct_stream_ptrs + global cur_buf + + if stream_ptrs and isinstance(stream_ptrs, list): + for i in range(num_direct_buffers): + gpu_direct_stream_ptrs[i] = stream_ptrs[i] + elif not stream_ptrs: + memset(gpu_direct_stream_ptrs, 0, sizeof(long) * num_direct_buffers) + + send_bufs[0] = msg0 + send_buf_sizes[0] = len(msg0) + CkGroupExtSendWithDeviceData(group_id, index, ep, + cur_buf, send_bufs, send_buf_sizes, + gpu_direct_device_ptrs, + gpu_direct_buff_sizes, + gpu_direct_stream_ptrs, + num_direct_buffers + ) + cur_buf = 1 + gpu_direct_buf_idx = 0 + + def CkGroupSendWithDeviceDataFromPointersArray(self, int gid, int index, int ep, + msg not None, array.array gpu_src_ptrs, + array.array gpu_src_sizes, stream_ptrs, + num_bufs): + cdef int i = 0 + global cur_buf + msg0, dcopy = msg + dcopy = None + + if stream_ptrs and isinstance(stream_ptrs, list): + for i in range(num_bufs): + gpu_direct_stream_ptrs[i] = stream_ptrs[i] + else: + memset(gpu_direct_stream_ptrs, 0, sizeof(long) * num_bufs) + + send_bufs[0] = msg0 + send_buf_sizes[0] = len(msg0) + CkGroupExtSendWithDeviceData(gid, index, ep, + cur_buf, send_bufs, send_buf_sizes, + gpu_src_ptrs.data.as_voidptr, + gpu_src_sizes.data.as_voidptr, + gpu_direct_stream_ptrs, + num_bufs + ) + cur_buf = 1 + gpu_direct_buf_idx = 0 + + def CkGroupSendWithDeviceDataFromPointersOther(self, int gid, int index, int ep, + msg not None, gpu_src_ptrs, + gpu_src_sizes, stream_ptrs, + num_bufs): + cdef int i = 0 + global cur_buf + msg0, dcopy = msg + dcopy = None + cdef unsigned long[:] gpu_addresses = gpu_src_ptrs + cdef int[:] gpu_buffer_sizes = gpu_src_sizes + + if stream_ptrs and isinstance(stream_ptrs, list): + for i in range(num_bufs): + gpu_direct_stream_ptrs[i] = stream_ptrs[i] + else: + memset(gpu_direct_stream_ptrs, 0, sizeof(long) * num_bufs) + + send_bufs[0] = msg0 + send_buf_sizes[0] = len(msg0) + CkGroupExtSendWithDeviceData(gid, index, ep, + cur_buf, send_bufs, send_buf_sizes, + &gpu_addresses[0], + &gpu_buffer_sizes[0], + gpu_direct_stream_ptrs, + num_bufs + ) + cur_buf = 1 + gpu_direct_buf_idx = 0 + def CkArraySendWithDeviceData(self, int array_id, index not None, int ep, msg not None, stream_ptrs): @@ -818,6 +905,7 @@ class CharmLib(object): registerReadOnlyRecvExtCallback(recvReadOnly) registerChareMsgRecvExtCallback(recvChareMsg) registerGroupMsgRecvExtCallback(recvGroupMsg) + registerGroupMsgGPUDirectRecvExtCallback(recvGPUDirectGroupMsg) registerArrayMsgRecvExtCallback(recvArrayMsg) 
registerArrayMsgGPUDirectRecvExtCallback(recvGPUDirectArrayMsg) registerArrayBcastRecvExtCallback(recvArrayBcast) @@ -1055,6 +1143,24 @@ cdef void recvGroupMsg(int gid, int ep, int msgSize, char *msg, int dcopy_start) except: charm.handleGeneralError() +cdef void recvGPUDirectGroupMsg(int gid, int ep, int numDevBuffs, + int *devBufSizes, void *devBufs, int msgSize, + char *msg, int dcopy_start + ): + try: + if PROFILING: + charm._precvtime = time.time() + charm.recordReceive(msgSize) + devBufInfo = array.array('L', [0] * numDevBuffs) + for idx in range(numDevBuffs): + # Add the buffer's address to the list + devBufInfo[idx] = devBufs+(CK_DEVICEBUFFER_SIZE_IN_BYTES*idx) + recv_buffer.setMsg(msg, msgSize) + charm.recvGPUDirectGroupMsg(gid, ep, devBufInfo, recv_buffer, dcopy_start) + except: + charm.handleGeneralError() + + cdef void recvArrayMsg(int aid, int ndims, int *arrayIndex, int ep, int msgSize, char *msg, int dcopy_start): try: if PROFILING: @@ -1080,7 +1186,7 @@ cdef void recvGPUDirectArrayMsg(int aid, int ndims, int *arrayIndex, int ep, int devBufInfo[idx] = devBufs+(CK_DEVICEBUFFER_SIZE_IN_BYTES*idx) recv_buffer.setMsg(msg, msgSize) # TODO: Can this be the same for array and groups? - charm.recvGPUDirectMsg(aid, array_index_to_tuple(ndims, arrayIndex), ep, devBufInfo, recv_buffer, dcopy_start) + charm.recvGPUDirectArrayMsg(aid, array_index_to_tuple(ndims, arrayIndex), ep, devBufInfo, recv_buffer, dcopy_start) except: charm.handleGeneralError() From 895e619f05dba50641741cc3e6b1962357f4eff0 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 19 May 2021 15:14:21 -0400 Subject: [PATCH 103/107] Groups can now use GPU-direct functionality --- charm4py/chare.py | 27 +++++++++++++++++++++++++-- charm4py/charm.py | 35 ++++++++++++++++++++++++++++++++--- 2 files changed, 57 insertions(+), 5 deletions(-) diff --git a/charm4py/chare.py b/charm4py/chare.py index 49f15f22..74aff874 100644 --- a/charm4py/chare.py +++ b/charm4py/chare.py @@ -461,6 +461,9 @@ def proxy_entry_method(proxy, *args, **kwargs): for i in range(num_args, argcount): argname = argnames[i] # first look for argument in kwargs + # TODO: Should stream_ptrs be skipped? 
+ if argname in {'stream_ptrs', 'gpu_src_ptrs', 'gpu_src_sizes'}: + continue if argname in kwargs: args.append(kwargs[argname]) else: @@ -485,8 +488,28 @@ def proxy_entry_method(proxy, *args, **kwargs): gid = proxy.gid if Options.local_msg_optim and (elemIdx == charm._myPe) and (len(args) > 0): destObj = charm.groups[gid] - msg = charm.packMsg(destObj, args, header) - charm.CkGroupSend(gid, elemIdx, ep, msg) + should_pack_gpu = True + if 'gpu_src_ptrs' in kwargs: + should_pack_gpu = False + msg = charm.packMsg(destObj, args, header, pack_gpu=should_pack_gpu) + if msg[1] or not should_pack_gpu: + if 'stream_ptrs' in kwargs: + stream_ptrs = kwargs['stream_ptrs'] + else: + stream_ptrs = None + if should_pack_gpu: + charm.CkGroupSendWithDeviceData(gid, elemIdx, ep, + msg, stream_ptrs + ) + else: + charm.CkGroupSendWithDeviceDataFromPointers(gid, elemIdx, ep, + msg, kwargs['gpu_src_ptrs'], + kwargs['gpu_src_sizes'], + stream_ptrs + ) + + else: + charm.CkGroupSend(gid, elemIdx, ep, msg) else: root, sid = proxy.section header[b'sid'] = sid diff --git a/charm4py/charm.py b/charm4py/charm.py index 64085ef3..4e626a9e 100644 --- a/charm4py/charm.py +++ b/charm4py/charm.py @@ -133,8 +133,11 @@ def __init__(self): self.CkGroupSend = self.lib.CkGroupSend self.CkArraySend = self.lib.CkArraySend self.CkArraySendWithDeviceData = self.lib.CkArraySendWithDeviceData + self.CkGroupSendWithDeviceData = self.lib.CkGroupSendWithDeviceData self.CkArraySendWithDeviceDataFromPointersArray = self.lib.CkArraySendWithDeviceDataFromPointersArray self.CkArraySendWithDeviceDataFromPointersOther = self.lib.CkArraySendWithDeviceDataFromPointersOther + self.CkGroupSendWithDeviceDataFromPointersArray = self.lib.CkGroupSendWithDeviceDataFromPointersArray + self.CkGroupSendWithDeviceDataFromPointersOther = self.lib.CkGroupSendWithDeviceDataFromPointersOther self.CkCudaEnabled = self.lib.CkCudaEnabled self.reducers = reduction.ReducerContainer(self) self.redMgr = reduction.ReductionManager(self, self.reducers) @@ -321,15 +324,21 @@ def recvArrayMsg(self, aid, index, ep, msg, dcopy_start): self.arrays[aid][index] = obj em.run(obj, header, args) # now call the user's array element __init__ - def recvGPUDirectMsg(self, aid, index, ep, - devBuf_ptrs, msg, dcopy_start - ): + def recvGPUDirectArrayMsg(self, aid, index, ep, + devBuf_ptrs, msg, dcopy_start + ): obj = self.arrays[aid][index] header, args = self.unpackMsg(msg, dcopy_start, obj) args.append(devBuf_ptrs) self.invokeEntryMethod(obj, ep, header, args) + def recvGPUDirectGroupMsg(self, gid, ep, devBuf_ptrs, msg, dcopy_start): + obj = self.groups[gid] + header, args = self.unpackMsg(msg, dcopy_start, obj) + args.append(devBuf_ptrs) + self.invokeEntryMethod(obj, ep, header, args) + def recvArrayBcast(self, aid, indexes, ep, msg, dcopy_start): header, args = self.unpackMsg(msg, dcopy_start, None) array = self.arrays[aid] @@ -397,6 +406,26 @@ def CkArraySendWithDeviceDataFromPointers(self, array_id, index, ep, len(gpu_src_ptrs) ) + def CkGroupSendWithDeviceDataFromPointers(self, gid, elemIdx, ep, + msg, gpu_src_ptrs, gpu_src_sizes, + stream_ptrs): + if isinstance(gpu_src_ptrs, array.array): + assert isinstance(gpu_src_sizes, array.array), \ + "GPU source pointers and sizes must be of the same type." 
+ self.CkGroupSendWithDeviceDataFromPointersArray(gid, elemIdx, ep, msg, + gpu_src_ptrs, + gpu_src_sizes, + stream_ptrs, + len(gpu_src_ptrs) + ) + else: + self.CkGroupSendWithDeviceDataFromPointersOther(gid, elemIdx, ep, msg, + gpu_src_ptrs, + gpu_src_sizes, + stream_ptrs, + len(gpu_src_ptrs) + ) + # deposit value of one of the futures that was created on this PE def _future_deposit_result(self, fid, result=None): self.threadMgr.depositFuture(fid, result) From 0abd4437b68be38dd4fe521fedb5930d9a1b9104 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 19 May 2021 15:14:36 -0400 Subject: [PATCH 104/107] test both arrays and groups --- tests/cuda/multi_array.py | 17 +++++++++++++++-- tests/cuda/single_array.py | 18 +++++++++++++++--- 2 files changed, 30 insertions(+), 5 deletions(-) diff --git a/tests/cuda/multi_array.py b/tests/cuda/multi_array.py index 4dc26c74..cb9d038d 100644 --- a/tests/cuda/multi_array.py +++ b/tests/cuda/multi_array.py @@ -7,10 +7,14 @@ class A(Chare): def __init__(self, msg_size): self.msg_size = msg_size + if type(self.thisIndex) is tuple: + self.idx = int(self.thisIndex[0]) + else: + self.idx = self.thisIndex @coro def run(self, done_future, addr_optimization=False): - partner = self.thisProxy[int(not self.thisIndex[0])] + partner = self.thisProxy[int(not self.idx)] partner_channel = Channel(self, partner) device_data = cuda.device_array(self.msg_size, dtype='int8') @@ -28,7 +32,7 @@ def run(self, done_future, addr_optimization=False): host_array = np.array(self.msg_size, dtype='int32') host_array.fill(42) - if self.thisIndex[0]: + if self.idx: h1 = np.ones(self.msg_size, dtype='int8') h2 = np.zeros(self.msg_size, dtype='int8') device_data.copy_to_device(h1) @@ -78,6 +82,15 @@ def main(args): chares.run(done_fut, addr_optimization=False) done_fut.get() + done_fut = Future() + chares.run(done_fut, addr_optimization=True) + done_fut.get() + + chares = Group(A, args=[(1<<30)]) + done_fut = Future() + chares.run(done_fut, addr_optimization=False) + done_fut.get() + done_fut = Future() chares.run(done_fut, addr_optimization=True) done_fut.get() diff --git a/tests/cuda/single_array.py b/tests/cuda/single_array.py index e53276f0..61836018 100644 --- a/tests/cuda/single_array.py +++ b/tests/cuda/single_array.py @@ -7,10 +7,13 @@ class A(Chare): def __init__(self, msg_size): self.msg_size = msg_size - + if type(self.thisIndex) is tuple: + self.idx = int(self.thisIndex[0]) + else: + self.idx = self.thisIndex @coro def run(self, done_future, addr_optimization=False): - partner = self.thisProxy[int(not self.thisIndex[0])] + partner = self.thisProxy[int(not self.idx)] partner_channel = Channel(self, partner) device_data = cuda.device_array(self.msg_size, dtype='int8') @@ -24,7 +27,7 @@ def run(self, done_future, addr_optimization=False): my_stream = cuda.stream() stream_addr = array.array('L', [my_stream.handle.value]) - if self.thisIndex[0]: + if self.idx: host_data = np.zeros(self.msg_size, dtype='int8') host_data.fill(5) device_data.copy_to_device(host_data) @@ -118,6 +121,15 @@ def main(args): chares.run(done_fut, addr_optimization=False) done_fut.get() + done_fut = Future() + chares.run(done_fut, addr_optimization=True) + done_fut.get() + + chares = Group(A, args=[1<<20]) + done_fut = Future() + chares.run(done_fut, addr_optimization=False) + done_fut.get() + done_fut = Future() chares.run(done_fut, addr_optimization=True) done_fut.get() From 53d35f293de9db1b9be8bd63d63a6c6e9a1b9466 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Thu, 20 May 2021 18:12:38 -0400 Subject: 
[PATCH 105/107] make post/src buffers more kwargs more general --- charm4py/channel.py | 12 +++++++----- charm4py/chare.py | 16 ++++++++-------- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/charm4py/channel.py b/charm4py/channel.py index 15a407d1..c5c5eed7 100644 --- a/charm4py/channel.py +++ b/charm4py/channel.py @@ -62,7 +62,7 @@ def send(self, *msg, **kwargs): self.remote._channelRecv__(self.remote_port, self.send_seqno, *msg, **kwargs) self.send_seqno = (self.send_seqno + 1) % CHAN_BUF_SIZE - def recv(self, *post_buffers, post_buf_addresses = None, post_buf_sizes = None, stream_ptrs = None): + def recv(self, *post_buffers, post_addresses = None, post_sizes = None, stream_ptrs = None): if self.recv_seqno in self.data: ret = self.data.pop(self.recv_seqno) else: @@ -81,9 +81,10 @@ def recv(self, *post_buffers, post_buf_addresses = None, post_buf_sizes = None, gpu_recv_bufs = ret assert len(post_buffers) == len(gpu_recv_bufs) + recv_future = charm.getGPUDirectData(post_buffers, gpu_recv_bufs, stream_ptrs) recv_future.get() - elif post_buf_addresses is not None: + elif post_addresses is not None: if isinstance(ret, tuple): gpu_recv_bufs = ret[-1] ret = ret[0:-1] @@ -91,9 +92,10 @@ def recv(self, *post_buffers, post_buf_addresses = None, post_buf_sizes = None, ret = ret[0] else: gpu_recv_bufs = ret - assert len(post_buf_addresses) == len(gpu_recv_bufs) - assert post_buf_sizes - recv_future = charm.getGPUDirectDataFromAddresses(post_buf_addresses, post_buf_sizes, gpu_recv_bufs, stream_ptrs) + + assert len(post_addresses) == len(gpu_recv_bufs) + assert post_sizes + recv_future = charm.getGPUDirectDataFromAddresses(post_addresses, post_sizes, gpu_recv_bufs, stream_ptrs) recv_future.get() diff --git a/charm4py/chare.py b/charm4py/chare.py index 74aff874..4bfa6245 100644 --- a/charm4py/chare.py +++ b/charm4py/chare.py @@ -462,7 +462,7 @@ def proxy_entry_method(proxy, *args, **kwargs): argname = argnames[i] # first look for argument in kwargs # TODO: Should stream_ptrs be skipped? - if argname in {'stream_ptrs', 'gpu_src_ptrs', 'gpu_src_sizes'}: + if argname in {'stream_ptrs', 'src_ptrs', 'src_sizes'}: continue if argname in kwargs: args.append(kwargs[argname]) @@ -489,7 +489,7 @@ def proxy_entry_method(proxy, *args, **kwargs): if Options.local_msg_optim and (elemIdx == charm._myPe) and (len(args) > 0): destObj = charm.groups[gid] should_pack_gpu = True - if 'gpu_src_ptrs' in kwargs: + if 'src_ptrs' in kwargs: should_pack_gpu = False msg = charm.packMsg(destObj, args, header, pack_gpu=should_pack_gpu) if msg[1] or not should_pack_gpu: @@ -503,8 +503,8 @@ def proxy_entry_method(proxy, *args, **kwargs): ) else: charm.CkGroupSendWithDeviceDataFromPointers(gid, elemIdx, ep, - msg, kwargs['gpu_src_ptrs'], - kwargs['gpu_src_sizes'], + msg, kwargs['src_ptrs'], + kwargs['src_sizes'], stream_ptrs ) @@ -745,7 +745,7 @@ def proxy_entry_method(proxy, *args, **kwargs): argname = argnames[i] # first look for argument in kwargs # TODO: Should stream_ptrs be skipped? 
- if argname in {'stream_ptrs', 'gpu_src_ptrs', 'gpu_src_sizes'}: + if argname in {'stream_ptrs', 'src_ptrs', 'src_sizes'}: continue if argname in kwargs and argname: args.append(kwargs[argname]) @@ -775,7 +775,7 @@ def proxy_entry_method(proxy, *args, **kwargs): if elemIdx in array: destObj = array[elemIdx] should_pack_gpu = True - if 'gpu_src_ptrs' in kwargs: + if 'src_ptrs' in kwargs: should_pack_gpu = False msg = charm.packMsg(destObj, args, header, pack_gpu = should_pack_gpu) if msg[1] or not should_pack_gpu: @@ -789,8 +789,8 @@ def proxy_entry_method(proxy, *args, **kwargs): ) else: charm.CkArraySendWithDeviceDataFromPointers(aid, elemIdx, ep, - msg, kwargs['gpu_src_ptrs'], - kwargs['gpu_src_sizes'], + msg, kwargs['src_ptrs'], + kwargs['src_sizes'], stream_ptrs ) From 52856ad2fdc952a9fa3b3e31490920b60d552d34 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Thu, 20 May 2021 18:13:02 -0400 Subject: [PATCH 106/107] update tests with new kwarg names --- tests/cuda/multi_array.py | 8 ++++---- tests/cuda/single_array.py | 26 +++++++++++++------------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/tests/cuda/multi_array.py b/tests/cuda/multi_array.py index cb9d038d..93c7876a 100644 --- a/tests/cuda/multi_array.py +++ b/tests/cuda/multi_array.py @@ -38,16 +38,16 @@ def run(self, done_future, addr_optimization=False): device_data.copy_to_device(h1) device_data2.copy_to_device(h2) if addr_optimization: - partner_channel.send(20, host_array, gpu_src_ptrs=d_addr, - gpu_src_sizes=d_size + partner_channel.send(20, host_array, src_ptrs=d_addr, + src_sizes=d_size ) partner_channel.recv() else: partner_channel.send(20, host_array, device_data, device_data2) else: if addr_optimization: - f, g = partner_channel.recv(post_buf_addresses=d_addr, - post_buf_sizes=d_size + f, g = partner_channel.recv(post_addresses=d_addr, + post_sizes=d_size ) else: f, g = partner_channel.recv(device_data, device_data2) diff --git a/tests/cuda/single_array.py b/tests/cuda/single_array.py index 61836018..1c3a3692 100644 --- a/tests/cuda/single_array.py +++ b/tests/cuda/single_array.py @@ -34,11 +34,11 @@ def run(self, done_future, addr_optimization=False): if addr_optimization: partner_channel.send(1, 2, "hello", np.ones(self.msg_size, dtype='int8'), - gpu_src_ptrs=d_addr, gpu_src_sizes=d_size, + src_ptrs=d_addr, src_sizes=d_size, stream_ptrs=stream_addr ) - p_data = partner_channel.recv(post_buf_addresses=d_addr, - post_buf_sizes=d_size, + p_data = partner_channel.recv(post_addresses=d_addr, + post_sizes=d_size, stream_ptrs=stream_addr ) else: @@ -54,9 +54,9 @@ def run(self, done_future, addr_optimization=False): assert np.array_equal(h_ary, host_data) if addr_optimization: - partner_channel.send(gpu_src_ptrs=d_addr, gpu_src_sizes=d_size) - partner_channel.recv(post_buf_addresses=d_addr, - post_buf_sizes=d_size + partner_channel.send(src_ptrs=d_addr, src_sizes=d_size) + partner_channel.recv(post_addresses=d_addr, + post_sizes=d_size ) else: partner_channel.send(device_data) @@ -66,8 +66,8 @@ def run(self, done_future, addr_optimization=False): assert np.array_equal(h_ary, host_data) else: if addr_optimization: - p_data = partner_channel.recv(post_buf_addresses=d_addr, - post_buf_sizes=d_size + p_data = partner_channel.recv(post_addresses=d_addr, + post_sizes=d_size ) else: p_data = partner_channel.recv(device_data) @@ -83,17 +83,17 @@ def run(self, done_future, addr_optimization=False): assert p_data == (1, 2, "hello") if addr_optimization: - partner_channel.send(2, 3, gpu_src_ptrs=d_addr, - 
gpu_src_sizes=d_size + partner_channel.send(2, 3, src_ptrs=d_addr, + src_sizes=d_size ) else: partner_channel.send(2, 3, device_data) if addr_optimization: - partner_channel.recv(post_buf_addresses=d_addr, - post_buf_sizes=d_size + partner_channel.recv(post_addresses=d_addr, + post_sizes=d_size ) - partner_channel.send(gpu_src_ptrs=d_addr, gpu_src_sizes=d_size) + partner_channel.send(src_ptrs=d_addr, src_sizes=d_size) else: partner_channel.recv(device_data) partner_channel.send(device_data) From ba3e95ce49a42d2226c6e4e74098f99109993568 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Thu, 20 May 2021 18:30:38 -0400 Subject: [PATCH 107/107] update calls to match new API --- tests/benchmark/pingpong_gpu.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/benchmark/pingpong_gpu.py b/tests/benchmark/pingpong_gpu.py index 5bfafd6e..4a771794 100644 --- a/tests/benchmark/pingpong_gpu.py +++ b/tests/benchmark/pingpong_gpu.py @@ -58,9 +58,9 @@ def do_iteration(self, message_size, num_iters, done_future): if self.gpu_direct and USE_ADDRESS_OPTIMIZATION: d_data_recv_addr = array.array('L', [0]) - d_data_recv_size = array.array('L', [0]) + d_data_recv_size = array.array('i', [0]) d_data_send_addr = array.array('L', [0]) - d_data_send_size = array.array('L', [0]) + d_data_send_size = array.array('i', [0]) d_data_recv_addr[0] = d_data_recv.__cuda_array_interface__['data'][0] d_data_recv_size[0] = d_data_recv.nbytes @@ -85,9 +85,9 @@ def do_iteration(self, message_size, num_iters, done_future): charm.lib.CudaStreamSynchronize(stream_address) else: if USE_ADDRESS_OPTIMIZATION: - partner_channel.send(gpu_src_ptrs = d_data_send_addr, gpu_src_sizes = d_data_send_size) - partner_channel.recv(post_buf_addresses = d_data_recv_addr, - post_buf_sizes = d_data_recv_size + partner_channel.send(src_ptrs = d_data_send_addr, src_sizes = d_data_send_size) + partner_channel.recv(post_addresses = d_data_recv_addr, + post_sizes = d_data_recv_size ) else: partner_channel.send(d_data_send) @@ -103,10 +103,10 @@ def do_iteration(self, message_size, num_iters, done_future): partner_channel.send(h_data_send) else: if USE_ADDRESS_OPTIMIZATION: - partner_channel.recv(post_buf_addresses = d_data_recv_addr, - post_buf_sizes = d_data_recv_size + partner_channel.recv(post_addresses = d_data_recv_addr, + post_sizes = d_data_recv_size ) - partner_channel.send(gpu_src_ptrs = d_data_send_addr, gpu_src_sizes = d_data_send_size) + partner_channel.send(src_ptrs = d_data_send_addr, src_sizes = d_data_send_size) else: partner_channel.recv(d_data_recv) partner_channel.send(d_data_send)
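
A minimal usage sketch, distilled from the tests added in this series (tests/cuda/single_array.py and tests/cuda/multi_array.py). It assumes a CUDA-aware Charm++/charm4py build (charm.CkCudaEnabled() returns True), a working numba CUDA driver, and exactly two PEs; the chare name Exchanger, the default msg_size, and the pairing scheme are illustrative only. The kwarg names follow the API as it stands after PATCH 105 (src_ptrs/src_sizes on send, post_addresses/post_sizes on recv).

from charm4py import charm, Chare, Group, Future, Channel, coro
import numpy as np
from numba import cuda
import array


class Exchanger(Chare):

    @coro
    def run(self, done_future, msg_size=1 << 20):
        # Group chares are indexed by PE number; pair PE 0 with PE 1.
        partner = self.thisProxy[int(not self.thisIndex)]
        ch = Channel(self, partner)

        dev_buf = cuda.device_array(msg_size, dtype='int8')
        # Cached address/size variant: addresses are unsigned longs ('L'),
        # sizes are plain ints ('i'), matching the typecodes used after PATCH 096.
        d_addr = array.array('L', [dev_buf.__cuda_array_interface__['data'][0]])
        d_size = array.array('i', [dev_buf.nbytes])

        if self.thisIndex == 0:
            dev_buf.copy_to_device(np.full(msg_size, 5, dtype='int8'))
            # Positional device arrays are detected and sent via GPU-direct.
            ch.send("meta", dev_buf)
            # Pre-extracted pointers/sizes skip the per-call pointer lookup.
            ch.send(src_ptrs=d_addr, src_sizes=d_size)
        else:
            # Host-side values come back as the return value; the device payload
            # lands in the posted receive buffer.
            meta = ch.recv(dev_buf)
            assert meta == "meta"
            ch.recv(post_addresses=d_addr, post_sizes=d_size)
            assert np.array_equal(dev_buf.copy_to_host(),
                                  np.full(msg_size, 5, dtype='int8'))

        self.reduce(done_future)


def main(args):
    if not charm.CkCudaEnabled():
        charm.exit(0)
    pair = Group(Exchanger)   # one chare per PE; run with exactly 2 PEs
    done = Future()
    pair.run(done)
    done.get()
    charm.exit(0)


charm.start(main)

The positional form lets the runtime extract device pointers on every call, while the address/size form lets the caller cache them up front; the latter is what the ping-pong benchmark's USE_ADDRESS_OPTIMIZATION path exercises in PATCH 107.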