From d8ef62c018f9605ddbd3abc6b07979109a630182 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Mon, 28 Dec 2020 13:41:24 -0600 Subject: [PATCH 001/107] Start editing benchmark --- tests/benchmark/pingpong.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/tests/benchmark/pingpong.py b/tests/benchmark/pingpong.py index cb6060c2..cb367f81 100644 --- a/tests/benchmark/pingpong.py +++ b/tests/benchmark/pingpong.py @@ -1,6 +1,7 @@ from charm4py import charm, Chare, Array, coro, Future from time import time -#import numpy as np +import numpy as np +from numba import cuda PAYLOAD = 100 # number of bytes NITER = 10000 @@ -15,11 +16,15 @@ def __init__(self): else: self.neighbor = self.thisProxy[0] - def start(self, done_future, threaded=False): + def start(self, done_future, threaded=False, gpu=False): + assert threaded ^ gpu self.done_future = done_future self.iter = 0 - #data = np.zeros(PAYLOAD, dtype='int8') - data = 3 + if not gpu: + data = np.zeros(PAYLOAD, dtype='int8') + else: + pass + # data = 3 self.startTime = time() if threaded: self.neighbor.recv_th(data) @@ -50,11 +55,13 @@ def main(args): threaded = False if len(args) > 1 and args[1] == '-t': threaded = True + elif len(args) >1 and args[1] == '--gpu': + gpu = True pings = Array(Ping, 2) charm.awaitCreation(pings) for _ in range(2): done_future = Future() - pings[0].start(done_future, threaded) + pings[0].start(done_future, threaded, gpu) totalTime = done_future.get() print("ping pong time per iter (us)=", totalTime / NITER * 1000000) exit() From 6804a96137325ac03133338e073b716525721f68 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 30 Dec 2020 10:21:49 -0500 Subject: [PATCH 002/107] Creation of standalone GPU pingpong file --- tests/benchmark/pingpong_gpu.py | 82 +++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 tests/benchmark/pingpong_gpu.py diff --git a/tests/benchmark/pingpong_gpu.py b/tests/benchmark/pingpong_gpu.py new file mode 100644 index 00000000..2c304456 --- /dev/null +++ b/tests/benchmark/pingpong_gpu.py @@ -0,0 +1,82 @@ +from charm4py import charm, Chare, Array, coro, Future +from time import time +import numpy as np +from numba import cuda + +PAYLOAD = 100 # number of bytes +NITER = 10000 + + +class Ping(Chare): + + def __init__(self, gpu, num_iters): + self.gpu = gpu + self.myIndex = self.thisIndex[0] + if self.myIndex == 0: + self.neighbor = self.thisProxy[1] + else: + self.neighbor = self.thisProxy[0] + + def start(self, done_future, payload_size): + self.done_future = done_future + self.iter = 0 + data = np.zeros(payload_size, dtype='int8') + if self.gpu: + data = cuda.to_device(data) + self.startTime = time() + + else: + self.neighbor.recv(data) + + def recv(self, data): + data = cuda.to_device(data) + if self.myIndex == 0: + self.iter += 1 + if self.iter == NITER: + totalTime = time() - self.startTime + self.done_future.send(totalTime) + return + data = data.copy_to_host() + self.neighbor.recv(data) + + @coro + def recv_th(self, data): + if self.myIndex == 0: + self.iter += 1 + if self.iter == NITER: + totalTime = time() - self.startTime + self.done_future.send(totalTime) + return + self.neighbor.recv_th(data) + + +def main(args): + threaded = False + gpu = False + min_msg_size, max_mig_size, low_iter, high_iter, printFormat, gpu = 0 + if len(args) < 7: + print("Doesn't have the required input params. 
Usage:" + " \n" + ) + charm.exit(-1) + + min_msg_size = int(args[1]) + max_msg_size = int(args[2]) + low_iter = int(args[3]) + high_iter = int(args[4]) + print_format = int(args[5]) + gpu = int(args[6]) + + pings = Array(Ping, 2, gpu) + charm.awaitCreation(pings) + for _ in range(2): + done_future = Future() + pings[0].start(done_future, threaded, gpu) + totalTime = done_future.get() + print("ping pong time per iter (us)=", totalTime / NITER * 1000000) + exit() + + +charm.start(main) From 28203e145b3b44982eab572718f61cc4c7ad17f2 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 30 Dec 2020 12:07:24 -0500 Subject: [PATCH 003/107] Added CPU-only pingpong --- tests/benchmark/pingpong_gpu.py | 123 ++++++++++++++++++-------------- 1 file changed, 71 insertions(+), 52 deletions(-) diff --git a/tests/benchmark/pingpong_gpu.py b/tests/benchmark/pingpong_gpu.py index 2c304456..b2c067ea 100644 --- a/tests/benchmark/pingpong_gpu.py +++ b/tests/benchmark/pingpong_gpu.py @@ -1,62 +1,74 @@ -from charm4py import charm, Chare, Array, coro, Future -from time import time +from charm4py import charm, Chare, Array, coro, Future, Channel, Group +import time import numpy as np from numba import cuda -PAYLOAD = 100 # number of bytes -NITER = 10000 +class Ping(Chare): + #TODO: How do we determine how many chares? + def __init__(self, use_gpudirect, print_format): + self.gpu_direct = use_gpudirect + self.num_chares = charm.numPes() + self.print_format = print_format + # self.am_low_chare = self.thisIndex < self.num_chares // 2 + self.am_low_chare = self.thisIndex == 0 -class Ping(Chare): + if self.am_low_chare: + print("Msg Size, Iterations, One-way Time (us), Bandwidth (bytes/us)") - def __init__(self, gpu, num_iters): - self.gpu = gpu - self.myIndex = self.thisIndex[0] - if self.myIndex == 0: - self.neighbor = self.thisProxy[1] - else: - self.neighbor = self.thisProxy[0] + @coro + def do_iteration(self, message_size, num_iters, done_future): + # TODO: How do we allocate device data again? + # dev_array = cuda.zeros(on_device) + # host_data = cuda.zeros(on_host) + data = np.zeros(message_size, dtype='int8') + partner_idx = int(not self.thisIndex) + # partner = self.thisProxy[self.thisIndex + self.num_chares // 2] + partner = self.thisProxy[partner_idx] + partner_channel = Channel(self, partner) - def start(self, done_future, payload_size): - self.done_future = done_future - self.iter = 0 - data = np.zeros(payload_size, dtype='int8') - if self.gpu: - data = cuda.to_device(data) - self.startTime = time() + tstart = time.time() - else: - self.neighbor.recv(data) - - def recv(self, data): - data = cuda.to_device(data) - if self.myIndex == 0: - self.iter += 1 - if self.iter == NITER: - totalTime = time() - self.startTime - self.done_future.send(totalTime) - return - data = data.copy_to_host() - self.neighbor.recv(data) - @coro - def recv_th(self, data): - if self.myIndex == 0: - self.iter += 1 - if self.iter == NITER: - totalTime = time() - self.startTime - self.done_future.send(totalTime) - return - self.neighbor.recv_th(data) + for _ in range(num_iters): + if self.am_low_chare: + if not self.gpu_direct: + dev_array = 0 # copy the device array to memory on device, TODO: use pinned memory? + # partner_channel.send(dev_array) + partner_channel.send(data) + partner_channel.recv() + + # if not self.gpu_direct: + else: + partner_channel.recv() + partner_channel.send(data) + # copy the data back to the device + + # TODO: should we have barrier (reduction) here? 
+ tend = time.time() + elapsed_time = tend - tstart + + if self.am_low_chare: + # display data here + self.display_iteration_data(elapsed_time, num_iters, message_size) + + self.reduce(done_future) + + def display_iteration_data(self, elapsed_time, num_iters, message_size): + elapsed_time /= 2 # 1-way performance, not RTT + elapsed_time /= num_iters # Time for each message + bandwidth = message_size / elapsed_time + if self.print_format == 0: + print(f'{message_size},{num_iters},{elapsed_time * 1e6},{bandwidth / 1e6}') + else: + print('Not implemented!') def main(args): - threaded = False - gpu = False - min_msg_size, max_mig_size, low_iter, high_iter, printFormat, gpu = 0 if len(args) < 7: print("Doesn't have the required input params. Usage:" - " " + " \n" ) @@ -67,16 +79,23 @@ def main(args): low_iter = int(args[3]) high_iter = int(args[4]) print_format = int(args[5]) - gpu = int(args[6]) + use_gpudirect = int(args[6]) - pings = Array(Ping, 2, gpu) + pings = Group(Ping, args=[use_gpudirect, print_format]) charm.awaitCreation(pings) - for _ in range(2): + msg_size = min_msg_size + + while msg_size <= max_msg_size: + if msg_size <= 1048576: + iter = low_iter + else: + iter = high_iter done_future = Future() - pings[0].start(done_future, threaded, gpu) - totalTime = done_future.get() - print("ping pong time per iter (us)=", totalTime / NITER * 1000000) - exit() + pings.do_iteration(msg_size, iter, done_future) + done_future.get() + msg_size *= 2 + + charm.exit() charm.start(main) From 681129cd918f8eed6f71d87e3ba0af8d9024738c Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 30 Dec 2020 13:02:54 -0500 Subject: [PATCH 004/107] Added correct print format, pinned memory now used for staging host memory --- tests/benchmark/pingpong_gpu.py | 60 ++++++++++++++++++++------------- 1 file changed, 36 insertions(+), 24 deletions(-) diff --git a/tests/benchmark/pingpong_gpu.py b/tests/benchmark/pingpong_gpu.py index b2c067ea..9ade4ecb 100644 --- a/tests/benchmark/pingpong_gpu.py +++ b/tests/benchmark/pingpong_gpu.py @@ -3,66 +3,78 @@ import numpy as np from numba import cuda -class Ping(Chare): +USE_PINNED = True - #TODO: How do we determine how many chares? +class Ping(Chare): def __init__(self, use_gpudirect, print_format): self.gpu_direct = use_gpudirect self.num_chares = charm.numPes() self.print_format = print_format - # self.am_low_chare = self.thisIndex < self.num_chares // 2 self.am_low_chare = self.thisIndex == 0 if self.am_low_chare: - print("Msg Size, Iterations, One-way Time (us), Bandwidth (bytes/us)") + if print_format == 0: + print("Msg Size, Iterations, One-way Time (us), " + "Bandwidth (bytes/us)" + ) + else: + print(f'{"Msg Size": <30} {"Iterations": <25} ' + f'{"One-way Time (us)": <20} {"Bandwidth (bytes/us)": <20}' + ) @coro def do_iteration(self, message_size, num_iters, done_future): - # TODO: How do we allocate device data again? 
- # dev_array = cuda.zeros(on_device) - # host_data = cuda.zeros(on_host) - data = np.zeros(message_size, dtype='int8') + if USE_PINNED: + h_data = cuda.pinned_array(message_size, dtype='int8') + else: + h_data = np.zeros(message_size, dtype='int8') + d_data = cuda.device_array(message_size, dtype='int8') + d_data.copy_to_device(h_data) partner_idx = int(not self.thisIndex) - # partner = self.thisProxy[self.thisIndex + self.num_chares // 2] partner = self.thisProxy[partner_idx] partner_channel = Channel(self, partner) tstart = time.time() - for _ in range(num_iters): if self.am_low_chare: if not self.gpu_direct: - dev_array = 0 # copy the device array to memory on device, TODO: use pinned memory? - # partner_channel.send(dev_array) - partner_channel.send(data) - partner_channel.recv() + d_data.copy_to_host(h_data) + # partner_channel.send(dev_array) + partner_channel.send(h_data) + d_data.copy_to_device(partner_channel.recv()) + else: + raise NotImplementedError("TODO: GPU Direct") - # if not self.gpu_direct: else: - partner_channel.recv() - partner_channel.send(data) - # copy the data back to the device + if not self.gpu_direct: + d_data.copy_to_device(partner_channel.recv()) + d_data.copy_to_host(h_data) + partner_channel.send(h_data) + else: + raise NotImplementedError("TODO: GPU Direct") - # TODO: should we have barrier (reduction) here? tend = time.time() elapsed_time = tend - tstart if self.am_low_chare: - # display data here self.display_iteration_data(elapsed_time, num_iters, message_size) self.reduce(done_future) def display_iteration_data(self, elapsed_time, num_iters, message_size): - elapsed_time /= 2 # 1-way performance, not RTT - elapsed_time /= num_iters # Time for each message + elapsed_time /= 2 # 1-way performance, not RTT + elapsed_time /= num_iters # Time for each message bandwidth = message_size / elapsed_time if self.print_format == 0: - print(f'{message_size},{num_iters},{elapsed_time * 1e6},{bandwidth / 1e6}') + print(f'{message_size},{num_iters},{elapsed_time * 1e6},' + f'{bandwidth / 1e6}' + ) else: - print('Not implemented!') + print(f'{message_size: <30} {num_iters: <25} ' + f'{elapsed_time * 1e6: <20} {bandwidth / 1e6: <20}' + ) def main(args): if len(args) < 7: From 5fdfee3956708570b006bffab8c0942079f37f1c Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 30 Dec 2020 13:04:37 -0500 Subject: [PATCH 005/107] Stage script --- tests/benchmark/pingpong.py | 44 +++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/tests/benchmark/pingpong.py b/tests/benchmark/pingpong.py index cb367f81..2c304456 100644 --- a/tests/benchmark/pingpong.py +++ b/tests/benchmark/pingpong.py @@ -9,35 +9,34 @@ class Ping(Chare): - def __init__(self): + def __init__(self, gpu, num_iters): + self.gpu = gpu self.myIndex = self.thisIndex[0] if self.myIndex == 0: self.neighbor = self.thisProxy[1] else: self.neighbor = self.thisProxy[0] - def start(self, done_future, threaded=False, gpu=False): - assert threaded ^ gpu + def start(self, done_future, payload_size): self.done_future = done_future self.iter = 0 - if not gpu: - data = np.zeros(PAYLOAD, dtype='int8') - else: - pass - # data = 3 - self.startTime = time() - if threaded: - self.neighbor.recv_th(data) + data = np.zeros(payload_size, dtype='int8') + if self.gpu: + data = cuda.to_device(data) + self.startTime = time() + else: self.neighbor.recv(data) def recv(self, data): + data = cuda.to_device(data) if self.myIndex == 0: self.iter += 1 if self.iter == NITER: totalTime = time() - 
self.startTime self.done_future.send(totalTime) return + data = data.copy_to_host() self.neighbor.recv(data) @coro @@ -53,11 +52,24 @@ def recv_th(self, data): def main(args): threaded = False - if len(args) > 1 and args[1] == '-t': - threaded = True - elif len(args) >1 and args[1] == '--gpu': - gpu = True - pings = Array(Ping, 2) + gpu = False + min_msg_size, max_mig_size, low_iter, high_iter, printFormat, gpu = 0 + if len(args) < 7: + print("Doesn't have the required input params. Usage:" + " \n" + ) + charm.exit(-1) + + min_msg_size = int(args[1]) + max_msg_size = int(args[2]) + low_iter = int(args[3]) + high_iter = int(args[4]) + print_format = int(args[5]) + gpu = int(args[6]) + + pings = Array(Ping, 2, gpu) charm.awaitCreation(pings) for _ in range(2): done_future = Future() From 54126345db3077e5e6dbe0c3bc113a7b55b320c7 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 30 Dec 2020 13:09:41 -0500 Subject: [PATCH 006/107] Reset changes to pingpong --- tests/benchmark/pingpong.py | 43 +++++++++++-------------------------- 1 file changed, 12 insertions(+), 31 deletions(-) diff --git a/tests/benchmark/pingpong.py b/tests/benchmark/pingpong.py index 2c304456..cb6060c2 100644 --- a/tests/benchmark/pingpong.py +++ b/tests/benchmark/pingpong.py @@ -1,7 +1,6 @@ from charm4py import charm, Chare, Array, coro, Future from time import time -import numpy as np -from numba import cuda +#import numpy as np PAYLOAD = 100 # number of bytes NITER = 10000 @@ -9,34 +8,31 @@ class Ping(Chare): - def __init__(self, gpu, num_iters): - self.gpu = gpu + def __init__(self): self.myIndex = self.thisIndex[0] if self.myIndex == 0: self.neighbor = self.thisProxy[1] else: self.neighbor = self.thisProxy[0] - def start(self, done_future, payload_size): + def start(self, done_future, threaded=False): self.done_future = done_future self.iter = 0 - data = np.zeros(payload_size, dtype='int8') - if self.gpu: - data = cuda.to_device(data) - self.startTime = time() - + #data = np.zeros(PAYLOAD, dtype='int8') + data = 3 + self.startTime = time() + if threaded: + self.neighbor.recv_th(data) else: self.neighbor.recv(data) def recv(self, data): - data = cuda.to_device(data) if self.myIndex == 0: self.iter += 1 if self.iter == NITER: totalTime = time() - self.startTime self.done_future.send(totalTime) return - data = data.copy_to_host() self.neighbor.recv(data) @coro @@ -52,28 +48,13 @@ def recv_th(self, data): def main(args): threaded = False - gpu = False - min_msg_size, max_mig_size, low_iter, high_iter, printFormat, gpu = 0 - if len(args) < 7: - print("Doesn't have the required input params. 
Usage:" - " \n" - ) - charm.exit(-1) - - min_msg_size = int(args[1]) - max_msg_size = int(args[2]) - low_iter = int(args[3]) - high_iter = int(args[4]) - print_format = int(args[5]) - gpu = int(args[6]) - - pings = Array(Ping, 2, gpu) + if len(args) > 1 and args[1] == '-t': + threaded = True + pings = Array(Ping, 2) charm.awaitCreation(pings) for _ in range(2): done_future = Future() - pings[0].start(done_future, threaded, gpu) + pings[0].start(done_future, threaded) totalTime = done_future.get() print("ping pong time per iter (us)=", totalTime / NITER * 1000000) exit() From 2d4b2c446b1806379f96e537efa5af5dd1df04f3 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 30 Dec 2020 13:18:22 -0500 Subject: [PATCH 007/107] Unify API between the CPU/GPU benchmarks --- tests/benchmark/pingpong.py | 137 +++++++++++++++++++++++------------- 1 file changed, 88 insertions(+), 49 deletions(-) diff --git a/tests/benchmark/pingpong.py b/tests/benchmark/pingpong.py index cb6060c2..6c027fb9 100644 --- a/tests/benchmark/pingpong.py +++ b/tests/benchmark/pingpong.py @@ -1,63 +1,102 @@ -from charm4py import charm, Chare, Array, coro, Future -from time import time -#import numpy as np +from charm4py import charm, Chare, Array, coro, Future, Channel, Group +import time +import numpy as np -PAYLOAD = 100 # number of bytes -NITER = 10000 +class Ping(Chare): + def __init__(self, use_zerocopy, print_format): + self.zero_copy = use_zerocopy + self.num_chares = charm.numPes() + self.print_format = print_format + self.am_low_chare = self.thisIndex == 0 + if self.am_low_chare: + if print_format == 0: + print("Msg Size, Iterations, One-way Time (us), " + "Bandwidth (bytes/us)" + ) + else: + print(f'{"Msg Size": <30} {"Iterations": <25} ' + f'{"One-way Time (us)": <20} {"Bandwidth (bytes/us)": <20}' + ) -class Ping(Chare): + @coro + def do_iteration(self, message_size, num_iters, done_future): + data = np.zeros(message_size, dtype='int8') + partner_idx = int(not self.thisIndex) + partner = self.thisProxy[partner_idx] + partner_channel = Channel(self, partner) - def __init__(self): - self.myIndex = self.thisIndex[0] - if self.myIndex == 0: - self.neighbor = self.thisProxy[1] - else: - self.neighbor = self.thisProxy[0] - - def start(self, done_future, threaded=False): - self.done_future = done_future - self.iter = 0 - #data = np.zeros(PAYLOAD, dtype='int8') - data = 3 - self.startTime = time() - if threaded: - self.neighbor.recv_th(data) - else: - self.neighbor.recv(data) + tstart = time.time() - def recv(self, data): - if self.myIndex == 0: - self.iter += 1 - if self.iter == NITER: - totalTime = time() - self.startTime - self.done_future.send(totalTime) - return - self.neighbor.recv(data) + for _ in range(num_iters): + if self.am_low_chare: + if not self.zero_copy: + partner_channel.send(data) + partner_channel.recv() + else: + raise NotImplementedError("TODO: ZeroCopy") - @coro - def recv_th(self, data): - if self.myIndex == 0: - self.iter += 1 - if self.iter == NITER: - totalTime = time() - self.startTime - self.done_future.send(totalTime) - return - self.neighbor.recv_th(data) + else: + if not self.zero_copy: + partner_channel.recv() + partner_channel.send(data) + else: + raise NotImplementedError("TODO: ZeroCopy") + + tend = time.time() + + elapsed_time = tend - tstart + if self.am_low_chare: + self.display_iteration_data(elapsed_time, num_iters, message_size) + + self.reduce(done_future) + + def display_iteration_data(self, elapsed_time, num_iters, message_size): + elapsed_time /= 2 # 1-way performance, not RTT + 
elapsed_time /= num_iters # Time for each message + bandwidth = message_size / elapsed_time + if self.print_format == 0: + print(f'{message_size},{num_iters},{elapsed_time * 1e6},' + f'{bandwidth / 1e6}' + ) + else: + print(f'{message_size: <30} {num_iters: <25} ' + f'{elapsed_time * 1e6: <20} {bandwidth / 1e6: <20}' + ) def main(args): - threaded = False - if len(args) > 1 and args[1] == '-t': - threaded = True - pings = Array(Ping, 2) + if len(args) < 7: + print("Doesn't have the required input params. Usage:" + " " + " \n" + ) + charm.exit(-1) + + min_msg_size = int(args[1]) + max_msg_size = int(args[2]) + low_iter = int(args[3]) + high_iter = int(args[4]) + print_format = int(args[5]) + use_zerocopy = int(args[6]) + + pings = Group(Ping, args=[use_zerocopy, print_format]) charm.awaitCreation(pings) - for _ in range(2): + msg_size = min_msg_size + + while msg_size <= max_msg_size: + if msg_size <= 1048576: + iter = low_iter + else: + iter = high_iter done_future = Future() - pings[0].start(done_future, threaded) - totalTime = done_future.get() - print("ping pong time per iter (us)=", totalTime / NITER * 1000000) - exit() + pings.do_iteration(msg_size, iter, done_future) + done_future.get() + msg_size *= 2 + + charm.exit() charm.start(main) From 76c50c706c28e75ac1dee6e3c67c712fbec87f3f Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Mon, 4 Jan 2021 11:05:48 -0600 Subject: [PATCH 008/107] Add macro test --- charm4py/charmlib/ccharm.pxd | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/charm4py/charmlib/ccharm.pxd b/charm4py/charmlib/ccharm.pxd index 5bbe1b05..12111a1a 100644 --- a/charm4py/charmlib/ccharm.pxd +++ b/charm4py/charmlib/ccharm.pxd @@ -13,6 +13,11 @@ cdef extern from "charm.h": void LBTurnInstrumentOn(); void LBTurnInstrumentOff(); + """ + #if CMK_CHARM4PY + #warn "CMK_CHARM4PY macro access within Cython~~~~~\n\n\n\n\n\n\n" + """ + int CmiPeOnSamePhysicalNode(int pe1, int pe2); int CmiNumPhysicalNodes(); int CmiPhysicalNodeID(int pe); From ba4a9f17e34c7414936b71721ebb081893828b6a Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 6 Jan 2021 12:33:38 -0500 Subject: [PATCH 009/107] add methods for Charm++ CUDA interface --- charm4py/charmlib/ccharm.pxd | 7 ++----- charm4py/charmlib/charmlib_cython.pyx | 3 +++ 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/charm4py/charmlib/ccharm.pxd b/charm4py/charmlib/ccharm.pxd index 12111a1a..0da6c355 100644 --- a/charm4py/charmlib/ccharm.pxd +++ b/charm4py/charmlib/ccharm.pxd @@ -13,11 +13,6 @@ cdef extern from "charm.h": void LBTurnInstrumentOn(); void LBTurnInstrumentOff(); - """ - #if CMK_CHARM4PY - #warn "CMK_CHARM4PY macro access within Cython~~~~~\n\n\n\n\n\n\n" - """ - int CmiPeOnSamePhysicalNode(int pe1, int pe2); int CmiNumPhysicalNodes(); int CmiPhysicalNodeID(int pe); @@ -75,6 +70,8 @@ cdef extern from "charm.h": void CkStartQDExt_SectionCallback(int sid_pe, int sid_cnt, int rootPE, int ep); void CcdCallFnAfter(void (*CcdVoidFn)(void *userParam,double curWallTime), void *arg, double msecs); + int CkCudaEnabled(); + int CUDAPointerOnDevice(const void *ptr); cdef extern from "spanningTree.h": void getPETopoTreeEdges(int pe, int rootPE, int *pes, int numpes, unsigned int bfactor, diff --git a/charm4py/charmlib/charmlib_cython.pyx b/charm4py/charmlib/charmlib_cython.pyx index aad5f323..7ed31051 100644 --- a/charm4py/charmlib/charmlib_cython.pyx +++ b/charm4py/charmlib/charmlib_cython.pyx @@ -506,6 +506,9 @@ class CharmLib(object): CkRegisterArrayExt(self.chareNames[-1], numEntryMethods, &chareIdx, &startEpIdx) 
return chareIdx, startEpIdx + def CUDAPointerOnDevice(self, long address): + return CUDAPointerOnDevice(address) + def CkCreateGroup(self, int chareIdx, int epIdx, msg not None): global cur_buf msg0, dcopy = msg From 3f3467133a216343298e45b565232b7b69352c87 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Sat, 9 Jan 2021 09:25:48 -0500 Subject: [PATCH 010/107] sender-side GPU direct --- charm4py/chare.py | 20 +++++++++-- charm4py/charmlib/ccharm.pxd | 9 +++++ charm4py/charmlib/charmlib_cython.pyx | 52 +++++++++++++++++++++++++-- 3 files changed, 76 insertions(+), 5 deletions(-) diff --git a/charm4py/chare.py b/charm4py/chare.py index a1447eb4..2e54b766 100644 --- a/charm4py/chare.py +++ b/charm4py/chare.py @@ -721,7 +721,10 @@ def proxy_entry_method(proxy, *args, **kwargs): for i in range(num_args, argcount): argname = argnames[i] # first look for argument in kwargs - if argname in kwargs: + # TODO: Should stream_ptrs be skipped? + if argname == 'stream_ptrs': + continue + if argname in kwargs and argname: args.append(kwargs[argname]) else: # if not there, see if there is a default value @@ -741,15 +744,26 @@ def proxy_entry_method(proxy, *args, **kwargs): if elemIdx == (): header[b'bcast'] = True if not proxy.issec or elemIdx != (): + # TODO: Check that this is channel proxy method? destObj = None aid = proxy.aid if Options.local_msg_optim and (len(args) > 0): array = charm.arrays[aid] if elemIdx in array: destObj = array[elemIdx] - msg = charm.packMsg(destObj, args, header) - charm.CkArraySend(aid, elemIdx, ep, msg) + msg, has_gpu_data = charm.packMsg(destObj, args, header) + if has_gpu_data: + if 'stream_ptrs' in kwargs and kwargs['stream_ptrs']: + stream_ptrs = kwargs['stream_ptrs'] + else: + stream_ptrs = None + charm.CkArraySendWithDeviceData(aid, elemIdx, ep, + msg, stream_ptrs + ) + else: + charm.CkArraySend(aid, elemIdx, ep, msg) else: + # TODO: Error if trying to send ZC data root, sid = proxy.section header[b'sid'] = sid if Options.local_msg_optim and root == charm._myPe: diff --git a/charm4py/charmlib/ccharm.pxd b/charm4py/charmlib/ccharm.pxd index 0da6c355..debcd537 100644 --- a/charm4py/charmlib/ccharm.pxd +++ b/charm4py/charmlib/ccharm.pxd @@ -72,6 +72,15 @@ cdef extern from "charm.h": int CkCudaEnabled(); int CUDAPointerOnDevice(const void *ptr); + void CkChareExtSendWithDeviceData(int aid, int *idx, int ndims, + int epIdx, int num_bufs, char *msg, + int msgSize, + void *devBufPtrs, + void *devBufSizesInBytes, + void *streamPtrs, int numDevBufs + ); + + cdef extern from "spanningTree.h": void getPETopoTreeEdges(int pe, int rootPE, int *pes, int numpes, unsigned int bfactor, diff --git a/charm4py/charmlib/charmlib_cython.pyx b/charm4py/charmlib/charmlib_cython.pyx index 7ed31051..0b5882f6 100644 --- a/charm4py/charmlib/charmlib_cython.pyx +++ b/charm4py/charmlib/charmlib_cython.pyx @@ -1,6 +1,6 @@ from ccharm cimport * from libc.stdlib cimport malloc, free -from libc.string cimport memcpy +from libc.string cimport memcpy, memset from libc.stdint cimport uintptr_t from cpython.version cimport PY_MAJOR_VERSION from cpython.buffer cimport PyObject_GetBuffer, PyBuffer_Release, PyBUF_ANY_CONTIGUOUS, PyBUF_SIMPLE @@ -292,12 +292,16 @@ cdef extern const char * const CmiCommitID cdef (char*)[NUM_DCOPY_BUFS] send_bufs # ?TODO bounds checking is needed where this is used cdef int[NUM_DCOPY_BUFS] send_buf_sizes # ?TODO bounds checking is needed where this is used cdef int cur_buf = 1 +cdef int gpu_direct_buf_idx = 0 cdef int[MAX_INDEX_LEN] c_index cdef Py_buffer send_buffer cdef 
ReceiveMsgBuffer recv_buffer = ReceiveMsgBuffer() cdef c_type_table_typecodes = [None] * 13 cdef int c_type_table_sizes[13] cdef int[SECTION_MAX_BFACTOR] section_children +cdef long[NUM_DCOPY_BUFS] gpu_direct_device_ptrs +cdef long[NUM_DCOPY_BUFS] gpu_direct_buff_sizes +cdef long[NUM_DCOPY_BUFS] gpu_direct_stream_ptrs cdef object charm cdef object charm_reducer_to_ctype @@ -449,6 +453,35 @@ class CharmLib(object): CkGroupExtSend_multi(group_id, num_pes, section_children, ep, cur_buf, send_bufs, send_buf_sizes) cur_buf = 1 + def CkArraySendWithDeviceData(self, int array_id, index not None, int ep, + msg not None, list stream_ptrs): + + global gpu_direct_buf_idx + cdef int i = 0 + cdef int ndims = len(index) + assert ndims == 1 + c_index[0] = index[0] + msg0, dcopy = msg + cdef int num_direct_buffers = gpu_direct_buf_idx + 1 + # TODO: Message on assertion failure + assert num_direct_buffers <= NUM_DCOPY_BUFS + global gpu_direct_device_ptrs + global gpu_direct_stream_ptrs + + if stream_ptrs: + for i in range(num_direct_buffers): + gpu_direct_stream_ptrs[i] = stream_ptrs[i] + else: + memset(gpu_direct_stream_ptrs, 0, sizeof(long) * num_direct_buffers) + + CkChareExtSendWithDeviceData(array_id, c_index, ndims, ep, 1, msg0, len(msg0), + gpu_direct_device_ptrs, + gpu_direct_buff_sizes, + gpu_direct_stream_ptrs, + num_direct_buffers + ) + gpu_direct_buf_idx = 0 + def CkArraySend(self, int array_id, index not None, int ep, msg not None): global cur_buf msg0, dcopy = msg @@ -787,8 +820,12 @@ class CharmLib(object): else: direct_copy_hdr = [] # goes to header args = list(msgArgs) + msg_has_gpu_args = False global cur_buf + global gpu_direct_buf_idx + global gpu_direct_device_ptrs cur_buf = 1 + gpu_direct_buf_idx = 0 for i in range(len(args)): arg = msgArgs[i] if isinstance(arg, np.ndarray) and not arg.dtype.hasobject: @@ -806,6 +843,15 @@ class CharmLib(object): nbytes = len(a) * a.itemsize # NOTE that cython's array C interface doesn't expose itemsize attribute direct_copy_hdr.append((i, 1, (a.typecode), nbytes)) send_bufs[cur_buf] = a.data.as_voidptr + elif CkCudaEnabled() and hasattr(arg, '__cuda_array_interface__'): + # we want to take the args that implement the cuda array interface and make them into ckdevicebuffers + # assumption: we can get nbytes from the arg directly + # TODO: verify this assertion for other types + gpu_direct_device_ptrs[gpu_direct_buf_idx] = arg['__cuda_array_interface__']['data'][0] + gpu_direct_buff_sizes[gpu_direct_buf_idx] = arg.nbytes + cuda_dev_info = True + gpu_direct_buf_idx += 1 + continue else: continue args[i] = None # will direct-copy this arg so remove from args list @@ -817,10 +863,12 @@ class CharmLib(object): msg = dumps((header, args), PICKLE_PROTOCOL) except: global cur_buf + global gpu_direct_buf_idx cur_buf = 1 + gpu_direct_buf_idx = 0 raise if PROFILING: charm.recordSend(len(msg) + dcopy_size) - return msg, None + return msg, cuda_dev_info def scheduleTagAfter(self, int tag, double msecs): CcdCallFnAfter(CcdCallFnAfterCallback, tag, msecs) From 4048015d5ee6132aec69ac494ba6184f54f59944 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Mon, 11 Jan 2021 19:12:34 -0500 Subject: [PATCH 011/107] add method to get GPU data --- charm4py/channel.py | 9 ++++++++- charm4py/charm.py | 23 ++++++++++++++++++++++- 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/charm4py/channel.py b/charm4py/channel.py index aaf058ec..c6748ee7 100644 --- a/charm4py/channel.py +++ b/charm4py/channel.py @@ -1,4 +1,5 @@ from .threads import LocalFuture +from .charm4py 
import getGPUDirectData class Channel(object): @@ -60,7 +61,7 @@ def send(self, *msg): self.remote._channelRecv__(self.remote_port, self.send_seqno, *msg) self.send_seqno = (self.send_seqno + 1) % CHAN_BUF_SIZE - def recv(self): + def recv(self, *post_buffers, stream_ptrs = None: if self.recv_seqno in self.data: ret = self.data.pop(self.recv_seqno) else: @@ -68,4 +69,10 @@ def recv(self): ret = self.recv_fut.get() self.recv_fut = None self.recv_seqno = (self.recv_seqno + 1) % CHAN_BUF_SIZE + + if post_buffers: + gpu_recv_bufs = ret.pop() + assert len(post_buffers) == len(gpu_recv_bufs) + recv_future = getGPUDirectData(post_buffers, gpu_recv_bufs, stream_ptrs) + recv_future.get() return ret diff --git a/charm4py/charm.py b/charm4py/charm.py index a13935fe..bc9b8d9d 100644 --- a/charm4py/charm.py +++ b/charm4py/charm.py @@ -123,6 +123,12 @@ def __init__(self): self.reducers = reduction.ReducerContainer(self) self.redMgr = reduction.ReductionManager(self, self.reducers) self.mainchareRegistered = False + # TODO: create a 'charm' CUDA interface + try: + from numba import cuda as numba_cuda + self.CUDA = numba_cuda + except ImportError: + raise Charm4PyError("Currently numba is required to use Charm4Py (temporary)") # entry point to Charm program. can be used in place of defining a Mainchare self.entry_func = None if self.lib.name == 'cython': @@ -305,6 +311,14 @@ def recvArrayMsg(self, aid, index, ep, msg, dcopy_start): self.arrays[aid][index] = obj em.run(obj, header, args) # now call the user's array element __init__ + def recvGPUDirectMsg(self, aid, index, ep, + devBuf_ptrs, msg, dcopy_start + ): + obj = self.arrays[aid][index] + header, args = self.unpackMsg(msg, dcopy_start, obj) + args.append(devBuf_ptrs) + self.invokeEntryMethod(obj, ep, header, args) + def recvArrayBcast(self, aid, indexes, ep, msg, dcopy_start): header, args = self.unpackMsg(msg, dcopy_start, None) array = self.arrays[aid] @@ -312,6 +326,7 @@ def recvArrayBcast(self, aid, indexes, ep, msg, dcopy_start): self.invokeEntryMethod(array[index], ep, header, args) def unpackMsg(self, msg, dcopy_start, dest_obj): + # Issue Rgets for GPU data in unpackMsg? But how does recv work? if msg[:7] == b'_local:': header, args = dest_obj.__removeLocal__(int(msg[7:])) else: @@ -332,6 +347,13 @@ def unpackMsg(self, msg, dcopy_start, dest_obj): return header, args + def getGPUDirectData(self, post_buffers, gpu_recv_bufs, stream_ptrs): + return_fut = self.Future() + if not streams: + stream_ptrs = [0] * len(post_buffers) + self.lib.getGPUDirectData(return_fut, post_buffers, gpu_recv_bufs, stream_ptrs) + return return_fut + def packMsg(self, destObj, msgArgs, header): """Prepares a message for sending, given arguments to an entry method invocation. 
@@ -1156,6 +1178,5 @@ def rebuildNumpyArray(data, shape, dt): a.shape = shape return a.copy() - charm = Charm() readonlies = __ReadOnlies() From fc535bbcc53548742c2cb50ac9fa3e5ee9ccaefe Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Mon, 11 Jan 2021 19:13:25 -0500 Subject: [PATCH 012/107] add methods to support receiver-side GPU Direct --- charm4py/charmlib/ccharm.pxd | 8 +++++ charm4py/charmlib/charmlib_cython.pyx | 43 ++++++++++++++++++++++++++- 2 files changed, 50 insertions(+), 1 deletion(-) diff --git a/charm4py/charmlib/ccharm.pxd b/charm4py/charmlib/ccharm.pxd index debcd537..2ec0dafb 100644 --- a/charm4py/charmlib/ccharm.pxd +++ b/charm4py/charmlib/ccharm.pxd @@ -70,6 +70,7 @@ cdef extern from "charm.h": void CkStartQDExt_SectionCallback(int sid_pe, int sid_cnt, int rootPE, int ep); void CcdCallFnAfter(void (*CcdVoidFn)(void *userParam,double curWallTime), void *arg, double msecs); + # TODO: Organize these to place them near their related functions int CkCudaEnabled(); int CUDAPointerOnDevice(const void *ptr); void CkChareExtSendWithDeviceData(int aid, int *idx, int ndims, @@ -80,9 +81,16 @@ cdef extern from "charm.h": void *streamPtrs, int numDevBufs ); + void registerArrayMsgGPUDirectRecvExtCallback(void (*cb)(int, int, int*, int, int, long*, void *, int, char*, int)); + cdef extern from "spanningTree.h": void getPETopoTreeEdges(int pe, int rootPE, int *pes, int numpes, unsigned int bfactor, int *parent, int *child_count, int **children); + +# cdef extern from "ckrdmadevice.h": +# cdef cppclass CkDeviceBuffer: +# CkDeviceBuffer() + diff --git a/charm4py/charmlib/charmlib_cython.pyx b/charm4py/charmlib/charmlib_cython.pyx index 0b5882f6..af1e4a43 100644 --- a/charm4py/charmlib/charmlib_cython.pyx +++ b/charm4py/charmlib/charmlib_cython.pyx @@ -285,9 +285,18 @@ cdef inline object array_index_to_tuple(int ndims, int *arrayIndex): PyTuple_SET_ITEM(arrIndex, i, d) return arrIndex - cdef extern const char * const CmiCommitID +# cdef class PyCkDeviceBuffer: +# cdef CkDeviceBuffer c_buff + +# @staticmethod +# cdef PyCkDeviceBuffer from_ptr(CkDeviceBuffer buf): +# cdef PyCkDeviceBuffer newBuf = PyCkDeviceBuffer.__new__(PyCkDeviceBuffer) +# newBuf.c_buff = buf +# return newBuf + + # supports up to NUM_DCOPY_BUFS direct-copy entry method arguments cdef (char*)[NUM_DCOPY_BUFS] send_bufs # ?TODO bounds checking is needed where this is used cdef int[NUM_DCOPY_BUFS] send_buf_sizes # ?TODO bounds checking is needed where this is used @@ -739,6 +748,7 @@ class CharmLib(object): registerChareMsgRecvExtCallback(recvChareMsg) registerGroupMsgRecvExtCallback(recvGroupMsg) registerArrayMsgRecvExtCallback(recvArrayMsg) + registerArrayMsgGPUDirectRecvExtCallback(recvGPUDirectMsg) registerArrayBcastRecvExtCallback(recvArrayBcast) registerArrayMapProcNumExtCallback(arrayMapProcNum) registerArrayElemJoinExtCallback(arrayElemJoin) @@ -873,6 +883,17 @@ class CharmLib(object): def scheduleTagAfter(self, int tag, double msecs): CcdCallFnAfter(CcdCallFnAfterCallback, tag, msecs) + def getGPUDirectData(self, list postbuf_ptrs, list gpu_recv_bufs, list stream_ptrs): + cdef int num_buffers = len(gpu_recv_bufs) + cdef int[num_buffers] gpu_buf_sizes + cdef (int*)[num_buffers] gpu_buf_ptrs + cdef int[num_buffers] stream_ptrs + + for idx in range(num_buffers): + gpu_buf_sizes[idx] = gpu_recv_bufs[idx][0] + gpu_buf_ptrs[idx] = gpu_recv_bufs[idx][1] + stream_ptrs[idx] = streams_ptrs[idx] + CkGetGPUDirectData() # first callback from Charm++ shared library cdef void registerMainModule(): @@ -925,6 +946,25 @@ cdef void 
recvArrayMsg(int aid, int ndims, int *arrayIndex, int ep, int msgSize, except: charm.handleGeneralError() +cdef void recvGPUDirectMsg(int aid, int ndims, int *arrayIndex, int ep, int numDevBuffs, + long *devBufSizes, void *devBufs, int msgSize, + char *msg, int dcopy_start): + cdef int idx = 0 + cdef void *bptr + try: + if PROFILING: + charm._precvtime = time.time() + charm.recordReceive(msgSize) + devBufInfo = [] + for idx in range(numDevBuffs): + dev_buf = devBufs[idx] + devBufInfo.append((devBufSizes[idx], dev_buf)) + recv_buffer.setMsg(msg, msgSize) + charm.recvGPUDirectMsg(aid, array_index_to_tuple(ndims, arrayIndex), ep, recv_buffer, mg, dcopy_start) + except: + charm.handleGeneralError() + + cdef void recvArrayBcast(int aid, int ndims, int nInts, int numElems, int *arrayIndexes, int ep, int msgSize, char *msg, int dcopy_start): cdef int i = 0 try: @@ -1107,3 +1147,4 @@ cdef void CcdCallFnAfterCallback(void *userParam, double curWallTime): charm.triggerCallable(userParam) except: charm.handleGeneralError() + From 67ed75bd2afb52eb39780b206b91fd6c3751a2d5 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Fri, 15 Jan 2021 16:07:22 -0500 Subject: [PATCH 013/107] hook up to Charm++ GPUDirect functionality --- charm4py/charmlib/ccharm.pxd | 1 + charm4py/charmlib/charmlib_cython.pyx | 35 +++++++++++++++++++-------- 2 files changed, 26 insertions(+), 10 deletions(-) diff --git a/charm4py/charmlib/ccharm.pxd b/charm4py/charmlib/ccharm.pxd index 2ec0dafb..bb8c1c3a 100644 --- a/charm4py/charmlib/ccharm.pxd +++ b/charm4py/charmlib/ccharm.pxd @@ -82,6 +82,7 @@ cdef extern from "charm.h": ); void registerArrayMsgGPUDirectRecvExtCallback(void (*cb)(int, int, int*, int, int, long*, void *, int, char*, int)); + void CkGetGPUDirectData(int numBuffers, void *recvBufPtrs, int *arrSizes, void *remoteBufInfo, void *streamPtrs); diff --git a/charm4py/charmlib/charmlib_cython.pyx b/charm4py/charmlib/charmlib_cython.pyx index af1e4a43..7c45f856 100644 --- a/charm4py/charmlib/charmlib_cython.pyx +++ b/charm4py/charmlib/charmlib_cython.pyx @@ -883,17 +883,32 @@ class CharmLib(object): def scheduleTagAfter(self, int tag, double msecs): CcdCallFnAfter(CcdCallFnAfterCallback, tag, msecs) - def getGPUDirectData(self, list postbuf_ptrs, list gpu_recv_bufs, list stream_ptrs): - cdef int num_buffers = len(gpu_recv_bufs) - cdef int[num_buffers] gpu_buf_sizes - cdef (int*)[num_buffers] gpu_buf_ptrs - cdef int[num_buffers] stream_ptrs + def getGPUDirectData(self, list post_buf_data, list remote_bufs, list stream_ptrs, return_fut): + cdef int num_buffers = len(post_buf_data) + cdef array.array int_array_template = array.array('i', []) + cdef array.array long_array_template = array.array('L', []) + cdef array.array recv_buf_sizes + cdef array.array recv_buf_ptrs + # pointers from the remote that we will be issuing Rgets for + # these are pointers to type CkDeviceBuffer + cdef array.array remote_buf_ptrs + + recv_buf_sizes = array.clone(int_array_template, num_buffers, zero=False) + recv_buf_ptrs = array.clone(long_array_template, num_buffers, zero=False) + stream_ptrs_forc = array.clone(long_array_template, num_buffers, zero=False) + remote_buf_ptrs = array.clone(long_array_template, num_buffers, zero=False) for idx in range(num_buffers): - gpu_buf_sizes[idx] = gpu_recv_bufs[idx][0] - gpu_buf_ptrs[idx] = gpu_recv_bufs[idx][1] - stream_ptrs[idx] = streams_ptrs[idx] - CkGetGPUDirectData() + recv_buf_ptrs[idx] = post_buf_data[idx][0] + recv_buf_sizes[idx] = remote_bufs[idx][0] + remote_buf_ptrs = remote_bufs[idx][1] + 
stream_ptrs_forc[idx] = stream_ptrs[idx] + # what do we do about the return future? Need to turn it into some callback. + CkGetGPUDirectData(num_buffers, recv_buf_ptrs.data.as_voidptr, + recv_buf_sizes.data.as_voidptr, + remote_buf_ptrs.data.as_voidptr, + stream_ptrs_forc.data.as_voidptr + ) # first callback from Charm++ shared library cdef void registerMainModule(): @@ -960,7 +975,7 @@ cdef void recvGPUDirectMsg(int aid, int ndims, int *arrayIndex, int ep, int numD dev_buf = devBufs[idx] devBufInfo.append((devBufSizes[idx], dev_buf)) recv_buffer.setMsg(msg, msgSize) - charm.recvGPUDirectMsg(aid, array_index_to_tuple(ndims, arrayIndex), ep, recv_buffer, mg, dcopy_start) + charm.recvGPUDirectMsg(aid, array_index_to_tuple(ndims, arrayIndex), ep, recv_buffer, msg, dcopy_start) except: charm.handleGeneralError() From 0f8bf705afe8138e522af92e5582a30679ce8c1a Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Fri, 15 Jan 2021 16:08:11 -0500 Subject: [PATCH 014/107] hooks into lib --- charm4py/charm.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/charm4py/charm.py b/charm4py/charm.py index bc9b8d9d..f4da7506 100644 --- a/charm4py/charm.py +++ b/charm4py/charm.py @@ -40,6 +40,11 @@ class NumpyDummy: def SECTION_ALL(obj): return 0 +def getDeviceDataInfo(devArray): + return devArray['__cuda_array_interface__']['data'] + +def getDeviceDataAddress(devArray): + return getDeviceDataInfo(devArray[0]) class Options(object): @@ -123,12 +128,6 @@ def __init__(self): self.reducers = reduction.ReducerContainer(self) self.redMgr = reduction.ReductionManager(self, self.reducers) self.mainchareRegistered = False - # TODO: create a 'charm' CUDA interface - try: - from numba import cuda as numba_cuda - self.CUDA = numba_cuda - except ImportError: - raise Charm4PyError("Currently numba is required to use Charm4Py (temporary)") # entry point to Charm program. 
can be used in place of defining a Mainchare self.entry_func = None if self.lib.name == 'cython': @@ -347,11 +346,12 @@ def unpackMsg(self, msg, dcopy_start, dest_obj): return header, args - def getGPUDirectData(self, post_buffers, gpu_recv_bufs, stream_ptrs): + def getGPUDirectData(self, post_buffers, remote_bufs, stream_ptrs): return_fut = self.Future() + post_buf_data = [getDeviceDataAddress(buf) for buf in post_buffers] if not streams: stream_ptrs = [0] * len(post_buffers) - self.lib.getGPUDirectData(return_fut, post_buffers, gpu_recv_bufs, stream_ptrs) + self.lib.getGPUDirectData(post_buf_data, remote_bufs, stream_ptrs, return_fut) return return_fut def packMsg(self, destObj, msgArgs, header): From 18b4e136209ef0d8847c767635e9131ccaae8d5c Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Fri, 15 Jan 2021 16:08:32 -0500 Subject: [PATCH 015/107] fix syntax error --- charm4py/channel.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/charm4py/channel.py b/charm4py/channel.py index c6748ee7..e6824ecb 100644 --- a/charm4py/channel.py +++ b/charm4py/channel.py @@ -1,5 +1,5 @@ from .threads import LocalFuture -from .charm4py import getGPUDirectData +from .charm import Charm class Channel(object): @@ -61,7 +61,7 @@ def send(self, *msg): self.remote._channelRecv__(self.remote_port, self.send_seqno, *msg) self.send_seqno = (self.send_seqno + 1) % CHAN_BUF_SIZE - def recv(self, *post_buffers, stream_ptrs = None: + def recv(self, *post_buffers, stream_ptrs = None): if self.recv_seqno in self.data: ret = self.data.pop(self.recv_seqno) else: @@ -73,6 +73,6 @@ def recv(self, *post_buffers, stream_ptrs = None: if post_buffers: gpu_recv_bufs = ret.pop() assert len(post_buffers) == len(gpu_recv_bufs) - recv_future = getGPUDirectData(post_buffers, gpu_recv_bufs, stream_ptrs) + Charm.recv_future = getGPUDirectData(post_buffers, gpu_recv_bufs, stream_ptrs) recv_future.get() return ret From 22297d27e3d41f28239e5296714f69685482d76c Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Sun, 17 Jan 2021 10:25:31 -0500 Subject: [PATCH 016/107] fix libcharm call --- charm4py/charm.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/charm4py/charm.py b/charm4py/charm.py index f4da7506..6bb3f28a 100644 --- a/charm4py/charm.py +++ b/charm4py/charm.py @@ -41,10 +41,10 @@ def SECTION_ALL(obj): return 0 def getDeviceDataInfo(devArray): - return devArray['__cuda_array_interface__']['data'] + return devArray.__cuda_array_interface__['data'] def getDeviceDataAddress(devArray): - return getDeviceDataInfo(devArray[0]) + return getDeviceDataInfo(devArray)[0] class Options(object): @@ -125,6 +125,7 @@ def __init__(self): self.CkChareSend = self.lib.CkChareSend self.CkGroupSend = self.lib.CkGroupSend self.CkArraySend = self.lib.CkArraySend + self.CkArraySendWithDeviceData = self.lib.CkArraySendWithDeviceData self.reducers = reduction.ReducerContainer(self) self.redMgr = reduction.ReductionManager(self, self.reducers) self.mainchareRegistered = False @@ -347,9 +348,9 @@ def unpackMsg(self, msg, dcopy_start, dest_obj): return header, args def getGPUDirectData(self, post_buffers, remote_bufs, stream_ptrs): - return_fut = self.Future() + return_fut = self.Future(len(post_buffers)) post_buf_data = [getDeviceDataAddress(buf) for buf in post_buffers] - if not streams: + if not stream_ptrs: stream_ptrs = [0] * len(post_buffers) self.lib.getGPUDirectData(post_buf_data, remote_bufs, stream_ptrs, return_fut) return return_fut From 0b316d6fdd602326122a960d95486b6673f87ff5 Mon Sep 17 00:00:00 2001 
From: Zane Fink Date: Sun, 17 Jan 2021 10:25:48 -0500 Subject: [PATCH 017/107] gpu_recv_bufs now correctly retrieved --- charm4py/channel.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/charm4py/channel.py b/charm4py/channel.py index e6824ecb..c7883a7f 100644 --- a/charm4py/channel.py +++ b/charm4py/channel.py @@ -1,5 +1,5 @@ from .threads import LocalFuture -from .charm import Charm +from .charm import charm class Channel(object): @@ -71,8 +71,15 @@ def recv(self, *post_buffers, stream_ptrs = None): self.recv_seqno = (self.recv_seqno + 1) % CHAN_BUF_SIZE if post_buffers: - gpu_recv_bufs = ret.pop() + gpu_recv_bufs = ret[-1] + ret = ret[:-1:1] assert len(post_buffers) == len(gpu_recv_bufs) - Charm.recv_future = getGPUDirectData(post_buffers, gpu_recv_bufs, stream_ptrs) + recv_future = charm.getGPUDirectData(post_buffers, gpu_recv_bufs, stream_ptrs) recv_future.get() return ret + + + + + + From 01a31d04d5ab3413497b4d6bb4299d9698500728 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Sun, 17 Jan 2021 10:26:26 -0500 Subject: [PATCH 018/107] fix incorrect check for CUDA data --- charm4py/chare.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/charm4py/chare.py b/charm4py/chare.py index 2e54b766..58ab2c06 100644 --- a/charm4py/chare.py +++ b/charm4py/chare.py @@ -751,8 +751,8 @@ def proxy_entry_method(proxy, *args, **kwargs): array = charm.arrays[aid] if elemIdx in array: destObj = array[elemIdx] - msg, has_gpu_data = charm.packMsg(destObj, args, header) - if has_gpu_data: + msg = charm.packMsg(destObj, args, header) + if msg[1]: if 'stream_ptrs' in kwargs and kwargs['stream_ptrs']: stream_ptrs = kwargs['stream_ptrs'] else: From 683a1c0ac43dbb429848d5997a6c4449a45a01e0 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Sun, 17 Jan 2021 10:26:42 -0500 Subject: [PATCH 019/107] add more API calls --- charm4py/charmlib/ccharm.pxd | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/charm4py/charmlib/ccharm.pxd b/charm4py/charmlib/ccharm.pxd index bb8c1c3a..7f2a34b2 100644 --- a/charm4py/charmlib/ccharm.pxd +++ b/charm4py/charmlib/ccharm.pxd @@ -76,13 +76,18 @@ cdef extern from "charm.h": void CkChareExtSendWithDeviceData(int aid, int *idx, int ndims, int epIdx, int num_bufs, char *msg, int msgSize, - void *devBufPtrs, - void *devBufSizesInBytes, - void *streamPtrs, int numDevBufs + long *devBufPtrs, + long *devBufSizesInBytes, + long *streamPtrs, int numDevBufs ); void registerArrayMsgGPUDirectRecvExtCallback(void (*cb)(int, int, int*, int, int, long*, void *, int, char*, int)); - void CkGetGPUDirectData(int numBuffers, void *recvBufPtrs, int *arrSizes, void *remoteBufInfo, void *streamPtrs); + void CkGetGPUDirectData(int numBuffers, void *recvBufPtrs, int *arrSizes, + void *remoteBufInfo, void *streamPtrs, int *futureId); + + int CkDeviceBufferSizeInBytes(); + + void registerDepositFutureWithIdFn(void (*cb)(void*, void*)); From 91a1737560d677d4a35021fed4d92b907590222f Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Sun, 17 Jan 2021 10:28:24 -0500 Subject: [PATCH 020/107] add call to register future deposit, add size of ck device buffer in bytes --- charm4py/charmlib/charmlib_cython.pyx | 62 ++++++++++++++++----------- 1 file changed, 37 insertions(+), 25 deletions(-) diff --git a/charm4py/charmlib/charmlib_cython.pyx b/charm4py/charmlib/charmlib_cython.pyx index 7c45f856..f62be726 100644 --- a/charm4py/charmlib/charmlib_cython.pyx +++ b/charm4py/charmlib/charmlib_cython.pyx @@ -30,7 +30,7 @@ ELSE: np = 
NumpyDummyModule() cdef object np_number = np.number - +cdef int CK_DEVICEBUFFER_SIZE_IN_BYTES = CkDeviceBufferSizeInBytes() # ------ global constants ------ @@ -468,10 +468,11 @@ class CharmLib(object): global gpu_direct_buf_idx cdef int i = 0 cdef int ndims = len(index) - assert ndims == 1 - c_index[0] = index[0] + # assert ndims == 1 + for i in range(ndims): c_index[i] = index[i] msg0, dcopy = msg - cdef int num_direct_buffers = gpu_direct_buf_idx + 1 + dcopy = None + cdef int num_direct_buffers = gpu_direct_buf_idx # TODO: Message on assertion failure assert num_direct_buffers <= NUM_DCOPY_BUFS global gpu_direct_device_ptrs @@ -484,9 +485,9 @@ class CharmLib(object): memset(gpu_direct_stream_ptrs, 0, sizeof(long) * num_direct_buffers) CkChareExtSendWithDeviceData(array_id, c_index, ndims, ep, 1, msg0, len(msg0), - gpu_direct_device_ptrs, - gpu_direct_buff_sizes, - gpu_direct_stream_ptrs, + gpu_direct_device_ptrs, + gpu_direct_buff_sizes, + gpu_direct_stream_ptrs, num_direct_buffers ) gpu_direct_buf_idx = 0 @@ -754,6 +755,7 @@ class CharmLib(object): registerArrayElemJoinExtCallback(arrayElemJoin) registerPyReductionExtCallback(pyReduction) registerCreateCallbackMsgExtCallback(createCallbackMsg) + registerDepositFutureWithIdFn(depositFutureWithId); def CkMyPe(self): return CkMyPeHook() def CkNumPes(self): return CkNumPesHook() @@ -820,6 +822,7 @@ class CharmLib(object): cdef array.array a IF HAVE_NUMPY: cdef np.ndarray np_array + cuda_dev_info = None dcopy_size = 0 if destObj is not None: # if dest obj is local localTag = destObj.__addLocal__((header, msgArgs)) @@ -830,7 +833,6 @@ class CharmLib(object): else: direct_copy_hdr = [] # goes to header args = list(msgArgs) - msg_has_gpu_args = False global cur_buf global gpu_direct_buf_idx global gpu_direct_device_ptrs @@ -838,7 +840,17 @@ class CharmLib(object): gpu_direct_buf_idx = 0 for i in range(len(args)): arg = msgArgs[i] - if isinstance(arg, np.ndarray) and not arg.dtype.hasobject: + if CkCudaEnabled() and hasattr(arg, '__cuda_array_interface__'): + # we want to take the args that implement the cuda array interface and make them into ckdevicebuffers + # assumption: we can get nbytes from the arg directly + # TODO: verify this assertion for other types + gpu_direct_device_ptrs[gpu_direct_buf_idx] = arg.__cuda_array_interface__['data'][0] + gpu_direct_buff_sizes[gpu_direct_buf_idx] = arg.nbytes + cuda_dev_info = True + gpu_direct_buf_idx += 1 + args[i] = None # TODO: should this be done? 
+ continue + elif isinstance(arg, np.ndarray) and not arg.dtype.hasobject: np_array = arg nbytes = np_array.nbytes direct_copy_hdr.append((i, 2, (arg.shape, np_array.dtype.name), nbytes)) @@ -853,15 +865,6 @@ class CharmLib(object): nbytes = len(a) * a.itemsize # NOTE that cython's array C interface doesn't expose itemsize attribute direct_copy_hdr.append((i, 1, (a.typecode), nbytes)) send_bufs[cur_buf] = a.data.as_voidptr - elif CkCudaEnabled() and hasattr(arg, '__cuda_array_interface__'): - # we want to take the args that implement the cuda array interface and make them into ckdevicebuffers - # assumption: we can get nbytes from the arg directly - # TODO: verify this assertion for other types - gpu_direct_device_ptrs[gpu_direct_buf_idx] = arg['__cuda_array_interface__']['data'][0] - gpu_direct_buff_sizes[gpu_direct_buf_idx] = arg.nbytes - cuda_dev_info = True - gpu_direct_buf_idx += 1 - continue else: continue args[i] = None # will direct-copy this arg so remove from args list @@ -885,6 +888,7 @@ class CharmLib(object): def getGPUDirectData(self, list post_buf_data, list remote_bufs, list stream_ptrs, return_fut): cdef int num_buffers = len(post_buf_data) + cdef int *future_id = malloc(sizeof(int)) cdef array.array int_array_template = array.array('i', []) cdef array.array long_array_template = array.array('L', []) cdef array.array recv_buf_sizes @@ -899,15 +903,16 @@ class CharmLib(object): remote_buf_ptrs = array.clone(long_array_template, num_buffers, zero=False) for idx in range(num_buffers): - recv_buf_ptrs[idx] = post_buf_data[idx][0] + recv_buf_ptrs[idx] = post_buf_data[idx] recv_buf_sizes[idx] = remote_bufs[idx][0] - remote_buf_ptrs = remote_bufs[idx][1] + remote_buf_ptrs[idx] = remote_bufs[idx][1] stream_ptrs_forc[idx] = stream_ptrs[idx] # what do we do about the return future? Need to turn it into some callback. 
CkGetGPUDirectData(num_buffers, recv_buf_ptrs.data.as_voidptr, recv_buf_sizes.data.as_voidptr, remote_buf_ptrs.data.as_voidptr, - stream_ptrs_forc.data.as_voidptr + stream_ptrs_forc.data.as_voidptr, + future_id ) # first callback from Charm++ shared library @@ -965,17 +970,18 @@ cdef void recvGPUDirectMsg(int aid, int ndims, int *arrayIndex, int ep, int numD long *devBufSizes, void *devBufs, int msgSize, char *msg, int dcopy_start): cdef int idx = 0 - cdef void *bptr try: if PROFILING: charm._precvtime = time.time() charm.recordReceive(msgSize) devBufInfo = [] for idx in range(numDevBuffs): - dev_buf = devBufs[idx] - devBufInfo.append((devBufSizes[idx], dev_buf)) + # Add the size of this buffer and a pointer to it to the info list + devBufInfo.append((devBufSizes[idx], + (devBufs+(CK_DEVICEBUFFER_SIZE_IN_BYTES*idx))) + ) recv_buffer.setMsg(msg, msgSize) - charm.recvGPUDirectMsg(aid, array_index_to_tuple(ndims, arrayIndex), ep, recv_buffer, msg, dcopy_start) + charm.recvGPUDirectMsg(aid, array_index_to_tuple(ndims, arrayIndex), ep, devBufInfo, recv_buffer, dcopy_start) except: charm.handleGeneralError() @@ -1037,6 +1043,12 @@ cdef void resumeFromSync(int aid, int ndims, int *arrayIndex): except: charm.handleGeneralError() +cdef void depositFutureWithId(void *param, void *msg): +# TODO: Figure out how this param value should be allocated/deallocated + cdef int futureId = ( param)[0] + free(param) + charm._future_deposit_result(futureId) + cdef void createCallbackMsg(void *data, int dataSize, int reducerType, int fid, int *sectionInfo, char **returnBuffers, int *returnBufferSizes): cdef int numElems From e012722919c2104c668d4b39811603f890f54c66 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Sun, 17 Jan 2021 10:28:51 -0500 Subject: [PATCH 021/107] WIP --- tests/benchmark/pingpong_gpu.py | 36 ++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/tests/benchmark/pingpong_gpu.py b/tests/benchmark/pingpong_gpu.py index 9ade4ecb..30310db7 100644 --- a/tests/benchmark/pingpong_gpu.py +++ b/tests/benchmark/pingpong_gpu.py @@ -1,16 +1,16 @@ -from charm4py import charm, Chare, Array, coro, Future, Channel, Group +from charm4py import charm, Chare, Array, coro, Future, Channel, Group, ArrayMap import time import numpy as np from numba import cuda -USE_PINNED = True +USE_PINNED = False class Ping(Chare): def __init__(self, use_gpudirect, print_format): self.gpu_direct = use_gpudirect self.num_chares = charm.numPes() self.print_format = print_format - self.am_low_chare = self.thisIndex == 0 + self.am_low_chare = self.thisIndex[0] == 0 if self.am_low_chare: if print_format == 0: @@ -27,10 +27,13 @@ def do_iteration(self, message_size, num_iters, done_future): if USE_PINNED: h_data = cuda.pinned_array(message_size, dtype='int8') else: - h_data = np.zeros(message_size, dtype='int8') + if self.am_low_chare: + h_data = np.ones(message_size, dtype='int8') + else: + h_data = np.zeros(message_size, dtype='int8') d_data = cuda.device_array(message_size, dtype='int8') d_data.copy_to_device(h_data) - partner_idx = int(not self.thisIndex) + partner_idx = int(not self.thisIndex[0]) partner = self.thisProxy[partner_idx] partner_channel = Channel(self, partner) @@ -40,11 +43,14 @@ def do_iteration(self, message_size, num_iters, done_future): if self.am_low_chare: if not self.gpu_direct: d_data.copy_to_host(h_data) - # partner_channel.send(dev_array) partner_channel.send(h_data) d_data.copy_to_device(partner_channel.recv()) else: - raise NotImplementedError("TODO: GPU Direct") + 
partner_channel.send(d_data) + break + # partner_channel.recv(d_data) + # sleep because callbacks not implemented yet + # charm.sleep(0.15) else: if not self.gpu_direct: @@ -52,7 +58,10 @@ def do_iteration(self, message_size, num_iters, done_future): d_data.copy_to_host(h_data) partner_channel.send(h_data) else: - raise NotImplementedError("TODO: GPU Direct") + partner_channel.recv(d_data) + # d_data.copy_to_host(h_data) + # print(h_data[0]) + # partner_channel.send(d_data) tend = time.time() @@ -76,13 +85,19 @@ def display_iteration_data(self, elapsed_time, num_iters, message_size): f'{elapsed_time * 1e6: <20} {bandwidth / 1e6: <20}' ) + +class ArrMap(ArrayMap): + def procNum(self, index): + return index[0] % 2 + + def main(args): if len(args) < 7: print("Doesn't have the required input params. Usage:" " " " \n" + "regular)> \n" ) charm.exit(-1) @@ -93,7 +108,8 @@ def main(args): print_format = int(args[5]) use_gpudirect = int(args[6]) - pings = Group(Ping, args=[use_gpudirect, print_format]) + peMap = Group(ArrMap) + pings = Array(Ping, 2, args=[use_gpudirect, print_format], map = peMap) charm.awaitCreation(pings) msg_size = min_msg_size From fa2072596bb3f82ecd2f5913259dacf1c253bcc5 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Mon, 18 Jan 2021 17:23:27 -0500 Subject: [PATCH 022/107] fix datatype passed to CkGetGPUDirectData --- charm4py/charmlib/charmlib_cython.pyx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/charm4py/charmlib/charmlib_cython.pyx b/charm4py/charmlib/charmlib_cython.pyx index f62be726..8801fcc2 100644 --- a/charm4py/charmlib/charmlib_cython.pyx +++ b/charm4py/charmlib/charmlib_cython.pyx @@ -908,10 +908,10 @@ class CharmLib(object): remote_buf_ptrs[idx] = remote_bufs[idx][1] stream_ptrs_forc[idx] = stream_ptrs[idx] # what do we do about the return future? Need to turn it into some callback. 
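      # (How this question gets resolved later in this series: the waiting future's
      # id is handed to the runtime along with the Rget request, and the runtime
      # fires a callback once every buffer has arrived. Rough sketch, using the
      # names these commits introduce:
      #
      #   return_fut = self.Future(len(post_buffers))    # created in charm.getGPUDirectData
      #   self.lib.getGPUDirectData(..., return_fut)     # passes return_fut.fid down to C++
      #   # the C++ side later invokes depositFutureWithId(fid), which calls
      #   # charm._future_deposit_result(fid) and unblocks return_fut.get())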
- CkGetGPUDirectData(num_buffers, recv_buf_ptrs.data.as_voidptr, + CkGetGPUDirectData(num_buffers, recv_buf_ptrs[0], recv_buf_sizes.data.as_voidptr, - remote_buf_ptrs.data.as_voidptr, - stream_ptrs_forc.data.as_voidptr, + remote_buf_ptrs[0], + stream_ptrs_forc[0], future_id ) @@ -978,7 +978,7 @@ cdef void recvGPUDirectMsg(int aid, int ndims, int *arrayIndex, int ep, int numD for idx in range(numDevBuffs): # Add the size of this buffer and a pointer to it to the info list devBufInfo.append((devBufSizes[idx], - (devBufs+(CK_DEVICEBUFFER_SIZE_IN_BYTES*idx))) + devBufs+(CK_DEVICEBUFFER_SIZE_IN_BYTES*idx)) ) recv_buffer.setMsg(msg, msgSize) charm.recvGPUDirectMsg(aid, array_index_to_tuple(ndims, arrayIndex), ep, devBufInfo, recv_buffer, dcopy_start) From a952d25f520e13d1c1d71430ff31d659f3ff7d02 Mon Sep 17 00:00:00 2001 From: Jaemin Choi Date: Mon, 18 Jan 2021 18:35:52 -0500 Subject: [PATCH 023/107] Temporary fix for requiring libmpi.so --- charm4py/charm.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/charm4py/charm.py b/charm4py/charm.py index 6bb3f28a..fea8a48f 100644 --- a/charm4py/charm.py +++ b/charm4py/charm.py @@ -111,11 +111,13 @@ def __init__(self): self.options.interactive.verbose = 1 self.options.interactive.broadcast_imports = True + ''' if 'OMPI_COMM_WORLD_SIZE' in os.environ: # this is needed for OpenMPI, see: # https://svn.open-mpi.org/trac/ompi/wiki/Linkers import ctypes self.__libmpi__ = ctypes.CDLL('libmpi.so', mode=ctypes.RTLD_GLOBAL) + ''' self.lib = load_charm_library(self) self.ReducerType = self.lib.ReducerType self.CkContributeToChare = self.lib.CkContributeToChare From 266b19f9917e42d04db9e45f870bc2b38a1f6b5d Mon Sep 17 00:00:00 2001 From: Jaemin Choi Date: Mon, 18 Jan 2021 19:01:11 -0500 Subject: [PATCH 024/107] Remove break from GPU pingpong benchmark --- tests/benchmark/pingpong_gpu.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/benchmark/pingpong_gpu.py b/tests/benchmark/pingpong_gpu.py index 30310db7..1e7a8eca 100644 --- a/tests/benchmark/pingpong_gpu.py +++ b/tests/benchmark/pingpong_gpu.py @@ -47,7 +47,6 @@ def do_iteration(self, message_size, num_iters, done_future): d_data.copy_to_device(partner_channel.recv()) else: partner_channel.send(d_data) - break # partner_channel.recv(d_data) # sleep because callbacks not implemented yet # charm.sleep(0.15) From 25d3ccedadddde5c9bed46d5b7ed7df769a699ce Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Mon, 18 Jan 2021 17:23:27 -0500 Subject: [PATCH 025/107] fix datatype passed to CkGetGPUDirectData --- charm4py/charmlib/charmlib_cython.pyx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/charm4py/charmlib/charmlib_cython.pyx b/charm4py/charmlib/charmlib_cython.pyx index f62be726..8801fcc2 100644 --- a/charm4py/charmlib/charmlib_cython.pyx +++ b/charm4py/charmlib/charmlib_cython.pyx @@ -908,10 +908,10 @@ class CharmLib(object): remote_buf_ptrs[idx] = remote_bufs[idx][1] stream_ptrs_forc[idx] = stream_ptrs[idx] # what do we do about the return future? Need to turn it into some callback. 
- CkGetGPUDirectData(num_buffers, recv_buf_ptrs.data.as_voidptr, + CkGetGPUDirectData(num_buffers, recv_buf_ptrs[0], recv_buf_sizes.data.as_voidptr, - remote_buf_ptrs.data.as_voidptr, - stream_ptrs_forc.data.as_voidptr, + remote_buf_ptrs[0], + stream_ptrs_forc[0], future_id ) @@ -978,7 +978,7 @@ cdef void recvGPUDirectMsg(int aid, int ndims, int *arrayIndex, int ep, int numD for idx in range(numDevBuffs): # Add the size of this buffer and a pointer to it to the info list devBufInfo.append((devBufSizes[idx], - (devBufs+(CK_DEVICEBUFFER_SIZE_IN_BYTES*idx))) + devBufs+(CK_DEVICEBUFFER_SIZE_IN_BYTES*idx)) ) recv_buffer.setMsg(msg, msgSize) charm.recvGPUDirectMsg(aid, array_index_to_tuple(ndims, arrayIndex), ep, devBufInfo, recv_buffer, dcopy_start) From 663ac5a26000da96e267064e109f9f25067fc17e Mon Sep 17 00:00:00 2001 From: Jaemin Choi Date: Wed, 20 Jan 2021 00:23:47 -0500 Subject: [PATCH 026/107] Debugging, fixed future ID and passing pointers to CkGetGPUDirectData --- charm4py/channel.py | 1 + charm4py/charm.py | 2 ++ charm4py/charmlib/charmlib_cython.pyx | 19 +++++++++++++++++-- tests/benchmark/pingpong_gpu.py | 2 ++ 4 files changed, 22 insertions(+), 2 deletions(-) diff --git a/charm4py/channel.py b/charm4py/channel.py index c7883a7f..08bd1696 100644 --- a/charm4py/channel.py +++ b/charm4py/channel.py @@ -75,6 +75,7 @@ def recv(self, *post_buffers, stream_ptrs = None): ret = ret[:-1:1] assert len(post_buffers) == len(gpu_recv_bufs) recv_future = charm.getGPUDirectData(post_buffers, gpu_recv_bufs, stream_ptrs) + print('recv_future fid', recv_future.fid) recv_future.get() return ret diff --git a/charm4py/charm.py b/charm4py/charm.py index fea8a48f..fc86fef6 100644 --- a/charm4py/charm.py +++ b/charm4py/charm.py @@ -316,6 +316,7 @@ def recvArrayMsg(self, aid, index, ep, msg, dcopy_start): def recvGPUDirectMsg(self, aid, index, ep, devBuf_ptrs, msg, dcopy_start ): + print('recvGPUDirectMsg') obj = self.arrays[aid][index] header, args = self.unpackMsg(msg, dcopy_start, obj) args.append(devBuf_ptrs) @@ -350,6 +351,7 @@ def unpackMsg(self, msg, dcopy_start, dest_obj): return header, args def getGPUDirectData(self, post_buffers, remote_bufs, stream_ptrs): + print('getGPUDirectData') return_fut = self.Future(len(post_buffers)) post_buf_data = [getDeviceDataAddress(buf) for buf in post_buffers] if not stream_ptrs: diff --git a/charm4py/charmlib/charmlib_cython.pyx b/charm4py/charmlib/charmlib_cython.pyx index 8801fcc2..8f2a0a51 100644 --- a/charm4py/charmlib/charmlib_cython.pyx +++ b/charm4py/charmlib/charmlib_cython.pyx @@ -465,6 +465,7 @@ class CharmLib(object): def CkArraySendWithDeviceData(self, int array_id, index not None, int ep, msg not None, list stream_ptrs): + print('CkArraySendWithDeviceData') global gpu_direct_buf_idx cdef int i = 0 cdef int ndims = len(index) @@ -887,8 +888,10 @@ class CharmLib(object): CcdCallFnAfter(CcdCallFnAfterCallback, tag, msecs) def getGPUDirectData(self, list post_buf_data, list remote_bufs, list stream_ptrs, return_fut): + print('charmlib getGPUDirectData') cdef int num_buffers = len(post_buf_data) cdef int *future_id = malloc(sizeof(int)) + future_id[0] = return_fut.fid cdef array.array int_array_template = array.array('i', []) cdef array.array long_array_template = array.array('L', []) cdef array.array recv_buf_sizes @@ -907,13 +910,22 @@ class CharmLib(object): recv_buf_sizes[idx] = remote_bufs[idx][0] remote_buf_ptrs[idx] = remote_bufs[idx][1] stream_ptrs_forc[idx] = stream_ptrs[idx] - # what do we do about the return future? 
Need to turn it into some callback. + print("Op %d: dest ptr: %02x, size: %d, src DeviceBuffer ptr: %02x, stream ptr: %02x\n" %(idx, recv_buf_ptrs[idx], recv_buf_sizes[idx], remote_buf_ptrs[idx], stream_ptrs_forc[idx])); + # what do we do about the return future? Need to turn it into some callback. + ''' CkGetGPUDirectData(num_buffers, recv_buf_ptrs[0], recv_buf_sizes.data.as_voidptr, remote_buf_ptrs[0], stream_ptrs_forc[0], future_id ) + ''' + CkGetGPUDirectData(num_buffers, recv_buf_ptrs.data.as_voidptr, + recv_buf_sizes.data.as_voidptr, + remote_buf_ptrs.data.as_voidptr, + stream_ptrs_forc.data.as_voidptr, + future_id + ) # first callback from Charm++ shared library cdef void registerMainModule(): @@ -1047,7 +1059,10 @@ cdef void depositFutureWithId(void *param, void *msg): # TODO: Figure out how this param value should be allocated/deallocated cdef int futureId = ( param)[0] free(param) - charm._future_deposit_result(futureId) + #charm._future_deposit_result(futureId) + print('depositFutureWithId', futureId) + charm.thisProxy[1]._future_deposit_result(futureId) + cdef void createCallbackMsg(void *data, int dataSize, int reducerType, int fid, int *sectionInfo, char **returnBuffers, int *returnBufferSizes): diff --git a/tests/benchmark/pingpong_gpu.py b/tests/benchmark/pingpong_gpu.py index 1e7a8eca..6ae0caa9 100644 --- a/tests/benchmark/pingpong_gpu.py +++ b/tests/benchmark/pingpong_gpu.py @@ -46,6 +46,7 @@ def do_iteration(self, message_size, num_iters, done_future): partner_channel.send(h_data) d_data.copy_to_device(partner_channel.recv()) else: + print("Sending ", hex(d_data.device_ctypes_pointer.value)) partner_channel.send(d_data) # partner_channel.recv(d_data) # sleep because callbacks not implemented yet @@ -57,6 +58,7 @@ def do_iteration(self, message_size, num_iters, done_future): d_data.copy_to_host(h_data) partner_channel.send(h_data) else: + print("Receiving ", hex(d_data.device_ctypes_pointer.value)) partner_channel.recv(d_data) # d_data.copy_to_host(h_data) # print(h_data[0]) From 9000ec3710984ee04cf60d6686050c66b603a3ab Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 20 Jan 2021 16:18:40 -0500 Subject: [PATCH 027/107] call local charm object, not charm remote when depositing GPU recv future --- charm4py/charm.py | 4 ++++ charm4py/charmlib/charmlib_cython.pyx | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/charm4py/charm.py b/charm4py/charm.py index fc86fef6..82968833 100644 --- a/charm4py/charm.py +++ b/charm4py/charm.py @@ -359,6 +359,10 @@ def getGPUDirectData(self, post_buffers, remote_bufs, stream_ptrs): self.lib.getGPUDirectData(post_buf_data, remote_bufs, stream_ptrs, return_fut) return return_fut + # deposit value of one of the futures that was created on this PE + def _future_deposit_result(self, fid, result=None): + self.threadMgr.depositFuture(fid, result) + def packMsg(self, destObj, msgArgs, header): """Prepares a message for sending, given arguments to an entry method invocation. 
diff --git a/charm4py/charmlib/charmlib_cython.pyx b/charm4py/charmlib/charmlib_cython.pyx index 8f2a0a51..c3912119 100644 --- a/charm4py/charmlib/charmlib_cython.pyx +++ b/charm4py/charmlib/charmlib_cython.pyx @@ -1061,7 +1061,7 @@ cdef void depositFutureWithId(void *param, void *msg): free(param) #charm._future_deposit_result(futureId) print('depositFutureWithId', futureId) - charm.thisProxy[1]._future_deposit_result(futureId) + charm._future_deposit_result(futureId) cdef void createCallbackMsg(void *data, int dataSize, int reducerType, int fid, int *sectionInfo, From baa180b3f9e39148256918bb67f4f2ad01e129ee Mon Sep 17 00:00:00 2001 From: Jaemin Choi Date: Wed, 20 Jan 2021 23:58:19 -0500 Subject: [PATCH 028/107] Remove debugging print statements --- charm4py/channel.py | 1 - charm4py/charm.py | 2 -- charm4py/charmlib/charmlib_cython.pyx | 13 ------------- tests/benchmark/pingpong_gpu.py | 2 -- 4 files changed, 18 deletions(-) diff --git a/charm4py/channel.py b/charm4py/channel.py index 08bd1696..c7883a7f 100644 --- a/charm4py/channel.py +++ b/charm4py/channel.py @@ -75,7 +75,6 @@ def recv(self, *post_buffers, stream_ptrs = None): ret = ret[:-1:1] assert len(post_buffers) == len(gpu_recv_bufs) recv_future = charm.getGPUDirectData(post_buffers, gpu_recv_bufs, stream_ptrs) - print('recv_future fid', recv_future.fid) recv_future.get() return ret diff --git a/charm4py/charm.py b/charm4py/charm.py index 82968833..a37fd7a8 100644 --- a/charm4py/charm.py +++ b/charm4py/charm.py @@ -316,7 +316,6 @@ def recvArrayMsg(self, aid, index, ep, msg, dcopy_start): def recvGPUDirectMsg(self, aid, index, ep, devBuf_ptrs, msg, dcopy_start ): - print('recvGPUDirectMsg') obj = self.arrays[aid][index] header, args = self.unpackMsg(msg, dcopy_start, obj) args.append(devBuf_ptrs) @@ -351,7 +350,6 @@ def unpackMsg(self, msg, dcopy_start, dest_obj): return header, args def getGPUDirectData(self, post_buffers, remote_bufs, stream_ptrs): - print('getGPUDirectData') return_fut = self.Future(len(post_buffers)) post_buf_data = [getDeviceDataAddress(buf) for buf in post_buffers] if not stream_ptrs: diff --git a/charm4py/charmlib/charmlib_cython.pyx b/charm4py/charmlib/charmlib_cython.pyx index c3912119..f20155b1 100644 --- a/charm4py/charmlib/charmlib_cython.pyx +++ b/charm4py/charmlib/charmlib_cython.pyx @@ -465,7 +465,6 @@ class CharmLib(object): def CkArraySendWithDeviceData(self, int array_id, index not None, int ep, msg not None, list stream_ptrs): - print('CkArraySendWithDeviceData') global gpu_direct_buf_idx cdef int i = 0 cdef int ndims = len(index) @@ -888,7 +887,6 @@ class CharmLib(object): CcdCallFnAfter(CcdCallFnAfterCallback, tag, msecs) def getGPUDirectData(self, list post_buf_data, list remote_bufs, list stream_ptrs, return_fut): - print('charmlib getGPUDirectData') cdef int num_buffers = len(post_buf_data) cdef int *future_id = malloc(sizeof(int)) future_id[0] = return_fut.fid @@ -910,16 +908,7 @@ class CharmLib(object): recv_buf_sizes[idx] = remote_bufs[idx][0] remote_buf_ptrs[idx] = remote_bufs[idx][1] stream_ptrs_forc[idx] = stream_ptrs[idx] - print("Op %d: dest ptr: %02x, size: %d, src DeviceBuffer ptr: %02x, stream ptr: %02x\n" %(idx, recv_buf_ptrs[idx], recv_buf_sizes[idx], remote_buf_ptrs[idx], stream_ptrs_forc[idx])); # what do we do about the return future? Need to turn it into some callback. 
- ''' - CkGetGPUDirectData(num_buffers, recv_buf_ptrs[0], - recv_buf_sizes.data.as_voidptr, - remote_buf_ptrs[0], - stream_ptrs_forc[0], - future_id - ) - ''' CkGetGPUDirectData(num_buffers, recv_buf_ptrs.data.as_voidptr, recv_buf_sizes.data.as_voidptr, remote_buf_ptrs.data.as_voidptr, @@ -1059,8 +1048,6 @@ cdef void depositFutureWithId(void *param, void *msg): # TODO: Figure out how this param value should be allocated/deallocated cdef int futureId = ( param)[0] free(param) - #charm._future_deposit_result(futureId) - print('depositFutureWithId', futureId) charm._future_deposit_result(futureId) diff --git a/tests/benchmark/pingpong_gpu.py b/tests/benchmark/pingpong_gpu.py index 6ae0caa9..1e7a8eca 100644 --- a/tests/benchmark/pingpong_gpu.py +++ b/tests/benchmark/pingpong_gpu.py @@ -46,7 +46,6 @@ def do_iteration(self, message_size, num_iters, done_future): partner_channel.send(h_data) d_data.copy_to_device(partner_channel.recv()) else: - print("Sending ", hex(d_data.device_ctypes_pointer.value)) partner_channel.send(d_data) # partner_channel.recv(d_data) # sleep because callbacks not implemented yet @@ -58,7 +57,6 @@ def do_iteration(self, message_size, num_iters, done_future): d_data.copy_to_host(h_data) partner_channel.send(h_data) else: - print("Receiving ", hex(d_data.device_ctypes_pointer.value)) partner_channel.recv(d_data) # d_data.copy_to_host(h_data) # print(h_data[0]) From 1061a1127b59fb6c4da8c133ee19e8c355d1d7d5 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Thu, 21 Jan 2021 10:09:11 -0500 Subject: [PATCH 029/107] put the pong back in the benchmark --- tests/benchmark/pingpong_gpu.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/tests/benchmark/pingpong_gpu.py b/tests/benchmark/pingpong_gpu.py index 1e7a8eca..3e5ce291 100644 --- a/tests/benchmark/pingpong_gpu.py +++ b/tests/benchmark/pingpong_gpu.py @@ -47,9 +47,7 @@ def do_iteration(self, message_size, num_iters, done_future): d_data.copy_to_device(partner_channel.recv()) else: partner_channel.send(d_data) - # partner_channel.recv(d_data) - # sleep because callbacks not implemented yet - # charm.sleep(0.15) + partner_channel.recv(d_data) else: if not self.gpu_direct: @@ -58,9 +56,7 @@ def do_iteration(self, message_size, num_iters, done_future): partner_channel.send(h_data) else: partner_channel.recv(d_data) - # d_data.copy_to_host(h_data) - # print(h_data[0]) - # partner_channel.send(d_data) + partner_channel.send(d_data) tend = time.time() @@ -112,6 +108,12 @@ def main(args): charm.awaitCreation(pings) msg_size = min_msg_size + # do a warmup iteration (should this be done for each size?) 
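    # (Later commits in this series answer this by folding warmup into
    # do_iteration itself, so every message size runs WARMUP_ITERS untimed
    # iterations before the timer starts, roughly:
    #
    #   for iternum in range(num_iters + WARMUP_ITERS):
    #       if iternum == WARMUP_ITERS:
    #           tstart = time.time()
    #
    # and this one-off warmup block goes away again.)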
+ done_future = Future() + pings.do_iteration(msg_size, high_iter, done_future) + done_future.get() + + while msg_size <= max_msg_size: if msg_size <= 1048576: iter = low_iter From a6c3684128e8712784899283eb707ef2c3ea0510 Mon Sep 17 00:00:00 2001 From: Jaemin Choi Date: Thu, 21 Jan 2021 13:07:47 -0500 Subject: [PATCH 030/107] Separate send and recv buffers in GPU pingpong --- tests/benchmark/pingpong_gpu.py | 35 +++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/tests/benchmark/pingpong_gpu.py b/tests/benchmark/pingpong_gpu.py index 3e5ce291..c90b0d0a 100644 --- a/tests/benchmark/pingpong_gpu.py +++ b/tests/benchmark/pingpong_gpu.py @@ -25,14 +25,19 @@ def __init__(self, use_gpudirect, print_format): @coro def do_iteration(self, message_size, num_iters, done_future): if USE_PINNED: - h_data = cuda.pinned_array(message_size, dtype='int8') + h_data_send = cuda.pinned_array(message_size, dtype='int8') + h_data_recv = cuda.pinned_array(message_size, dtype='int8') else: if self.am_low_chare: - h_data = np.ones(message_size, dtype='int8') + h_data_send = np.ones(message_size, dtype='int8') + h_data_recv = np.ones(message_size, dtype='int8') else: - h_data = np.zeros(message_size, dtype='int8') - d_data = cuda.device_array(message_size, dtype='int8') - d_data.copy_to_device(h_data) + h_data_send = np.zeros(message_size, dtype='int8') + h_data_recv = np.zeros(message_size, dtype='int8') + d_data_send = cuda.device_array(message_size, dtype='int8') + d_data_recv = cuda.device_array(message_size, dtype='int8') + d_data_send.copy_to_device(h_data_send) + d_data_recv.copy_to_device(h_data_recv) partner_idx = int(not self.thisIndex[0]) partner = self.thisProxy[partner_idx] partner_channel = Channel(self, partner) @@ -42,21 +47,21 @@ def do_iteration(self, message_size, num_iters, done_future): for _ in range(num_iters): if self.am_low_chare: if not self.gpu_direct: - d_data.copy_to_host(h_data) - partner_channel.send(h_data) - d_data.copy_to_device(partner_channel.recv()) + d_data_send.copy_to_host(h_data_send) + partner_channel.send(h_data_send) + d_data_recv.copy_to_device(partner_channel.recv()) else: - partner_channel.send(d_data) - partner_channel.recv(d_data) + partner_channel.send(d_data_send) + partner_channel.recv(d_data_recv) else: if not self.gpu_direct: - d_data.copy_to_device(partner_channel.recv()) - d_data.copy_to_host(h_data) - partner_channel.send(h_data) + d_data_recv.copy_to_device(partner_channel.recv()) + d_data_send.copy_to_host(h_data_send) + partner_channel.send(h_data_send) else: - partner_channel.recv(d_data) - partner_channel.send(d_data) + partner_channel.recv(d_data_recv) + partner_channel.send(d_data_send) tend = time.time() From 3d2fd1544b30bf73fd6fa17d5bb10ff42edb3df5 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Sat, 23 Jan 2021 11:24:19 -0500 Subject: [PATCH 031/107] use pinned memory by default, add warmup iterations --- tests/benchmark/pingpong_gpu.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/tests/benchmark/pingpong_gpu.py b/tests/benchmark/pingpong_gpu.py index c90b0d0a..27ce0a4f 100644 --- a/tests/benchmark/pingpong_gpu.py +++ b/tests/benchmark/pingpong_gpu.py @@ -3,7 +3,8 @@ import numpy as np from numba import cuda -USE_PINNED = False +USE_PINNED = True +WARMUP_ITERS = 10 class Ping(Chare): def __init__(self, use_gpudirect, print_format): @@ -34,6 +35,7 @@ def do_iteration(self, message_size, num_iters, done_future): else: h_data_send = np.zeros(message_size, 
dtype='int8') h_data_recv = np.zeros(message_size, dtype='int8') + d_data_send = cuda.device_array(message_size, dtype='int8') d_data_recv = cuda.device_array(message_size, dtype='int8') d_data_send.copy_to_device(h_data_send) @@ -42,6 +44,26 @@ def do_iteration(self, message_size, num_iters, done_future): partner = self.thisProxy[partner_idx] partner_channel = Channel(self, partner) + for _ in range(WARMUP_ITERS): + if self.am_low_chare: + if not self.gpu_direct: + d_data_send.copy_to_host(h_data_send) + partner_channel.send(h_data_send) + d_data_recv.copy_to_device(partner_channel.recv()) + else: + partner_channel.send(d_data_send) + partner_channel.recv(d_data_recv) + + else: + if not self.gpu_direct: + d_data_recv.copy_to_device(partner_channel.recv()) + d_data_send.copy_to_host(h_data_send) + partner_channel.send(h_data_send) + else: + partner_channel.recv(d_data_recv) + partner_channel.send(d_data_send) + + tstart = time.time() for _ in range(num_iters): From 36777386f91b3eac13d7bfe6d104f9f496a2f55b Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Sat, 23 Jan 2021 11:26:56 -0500 Subject: [PATCH 032/107] remove warmup for only the first size --- tests/benchmark/pingpong_gpu.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tests/benchmark/pingpong_gpu.py b/tests/benchmark/pingpong_gpu.py index 27ce0a4f..e4474e88 100644 --- a/tests/benchmark/pingpong_gpu.py +++ b/tests/benchmark/pingpong_gpu.py @@ -135,12 +135,6 @@ def main(args): charm.awaitCreation(pings) msg_size = min_msg_size - # do a warmup iteration (should this be done for each size?) - done_future = Future() - pings.do_iteration(msg_size, high_iter, done_future) - done_future.get() - - while msg_size <= max_msg_size: if msg_size <= 1048576: iter = low_iter From aac9651991efa5ea5b8614f78814ec1d9d1d8b15 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Sat, 23 Jan 2021 18:24:41 -0500 Subject: [PATCH 033/107] send, recv can now have the pointers set directly --- charm4py/channel.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/charm4py/channel.py b/charm4py/channel.py index c7883a7f..e187f19f 100644 --- a/charm4py/channel.py +++ b/charm4py/channel.py @@ -1,5 +1,6 @@ from .threads import LocalFuture from .charm import charm +import time class Channel(object): @@ -53,15 +54,15 @@ def ready(self): def waitReady(self, f): self.wait_ready = f - def send(self, *msg): + def send(self, *msg, **kwargs): if not self.established: self.established_fut = LocalFuture() self.established_fut.get() self.setEstablished() - self.remote._channelRecv__(self.remote_port, self.send_seqno, *msg) + self.remote._channelRecv__(self.remote_port, self.send_seqno, *msg, **kwargs) self.send_seqno = (self.send_seqno + 1) % CHAN_BUF_SIZE - def recv(self, *post_buffers, stream_ptrs = None): + def recv(self, *post_buffers, post_buf_addresses = None, post_buf_sizes = None, stream_ptrs = None): if self.recv_seqno in self.data: ret = self.data.pop(self.recv_seqno) else: @@ -72,10 +73,19 @@ def recv(self, *post_buffers, stream_ptrs = None): if post_buffers: gpu_recv_bufs = ret[-1] - ret = ret[:-1:1] + # ret = ret[:-1:1] assert len(post_buffers) == len(gpu_recv_bufs) recv_future = charm.getGPUDirectData(post_buffers, gpu_recv_bufs, stream_ptrs) recv_future.get() + elif post_buf_addresses != None: + gpu_recv_bufs = ret + # ret = ret[:-1:1] + assert len(post_buffers) == len(gpu_recv_bufs) + assert post_buf_sizes + recv_future = charm.getGPUDirectDataFromAddresses(post_buf_addresses, post_buf_sizes, gpu_recv_bufs, stream_ptrs) + 
recv_future.get() + + return ret From 8e1c02fd7d87f8853d3e5a8622b98ccab1e7d9d9 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Sat, 23 Jan 2021 18:25:06 -0500 Subject: [PATCH 034/107] add calls to CkArraySendWithDeviceDataFromPointers when applicable --- charm4py/chare.py | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/charm4py/chare.py b/charm4py/chare.py index 58ab2c06..35d53bd8 100644 --- a/charm4py/chare.py +++ b/charm4py/chare.py @@ -722,7 +722,7 @@ def proxy_entry_method(proxy, *args, **kwargs): argname = argnames[i] # first look for argument in kwargs # TODO: Should stream_ptrs be skipped? - if argname == 'stream_ptrs': + if argname in {'stream_ptrs', 'gpu_src_ptrs', 'gpu_src_sizes'}: continue if argname in kwargs and argname: args.append(kwargs[argname]) @@ -751,15 +751,27 @@ def proxy_entry_method(proxy, *args, **kwargs): array = charm.arrays[aid] if elemIdx in array: destObj = array[elemIdx] - msg = charm.packMsg(destObj, args, header) - if msg[1]: - if 'stream_ptrs' in kwargs and kwargs['stream_ptrs']: + should_pack_gpu = True + if 'gpu_src_ptrs' in kwargs: + should_pack_gpu = False + msg = charm.packMsg(destObj, args, header, pack_gpu = should_pack_gpu) + if msg[1] or not should_pack_gpu: + if 'stream_ptrs' in kwargs: stream_ptrs = kwargs['stream_ptrs'] else: stream_ptrs = None - charm.CkArraySendWithDeviceData(aid, elemIdx, ep, - msg, stream_ptrs - ) + if should_pack_gpu: + charm.CkArraySendWithDeviceData(aid, elemIdx, ep, + msg, stream_ptrs + ) + else: + charm.CkArraySendWithDeviceDataFromPointers(aid, elemIdx, ep, + msg, kwargs['gpu_src_ptrs'], + kwargs['gpu_src_sizes'], + stream_ptrs, + len(kwargs['gpu_src_ptrs']) + ) + else: charm.CkArraySend(aid, elemIdx, ep, msg) else: From 71004957dc47eee95b98546ebd8fe2fb48eb6ee2 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Sat, 23 Jan 2021 18:25:48 -0500 Subject: [PATCH 035/107] update DirectCopy API so buffer info is not always gathered --- charm4py/charm.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/charm4py/charm.py b/charm4py/charm.py index a37fd7a8..f1e87523 100644 --- a/charm4py/charm.py +++ b/charm4py/charm.py @@ -28,6 +28,7 @@ from . import reduction from . 
import wait import array +import numpy as np try: import numpy except ImportError: @@ -46,6 +47,9 @@ def getDeviceDataInfo(devArray): def getDeviceDataAddress(devArray): return getDeviceDataInfo(devArray)[0] +def getDeviceDataSizeInBytes(devArray): + return devArray.nbytes + class Options(object): def __str__(self): @@ -128,6 +132,7 @@ def __init__(self): self.CkGroupSend = self.lib.CkGroupSend self.CkArraySend = self.lib.CkArraySend self.CkArraySendWithDeviceData = self.lib.CkArraySendWithDeviceData + self.CkArraySendWithDeviceDataFromPointers = self.lib.CkArraySendWithDeviceDataFromPointers self.reducers = reduction.ReducerContainer(self) self.redMgr = reduction.ReductionManager(self, self.reducers) self.mainchareRegistered = False @@ -319,6 +324,7 @@ def recvGPUDirectMsg(self, aid, index, ep, obj = self.arrays[aid][index] header, args = self.unpackMsg(msg, dcopy_start, obj) args.append(devBuf_ptrs) + self.invokeEntryMethod(obj, ep, header, args) def recvArrayBcast(self, aid, indexes, ep, msg, dcopy_start): @@ -352,9 +358,17 @@ def unpackMsg(self, msg, dcopy_start, dest_obj): def getGPUDirectData(self, post_buffers, remote_bufs, stream_ptrs): return_fut = self.Future(len(post_buffers)) post_buf_data = [getDeviceDataAddress(buf) for buf in post_buffers] + post_buf_sizes = [getDeviceDataSizeInBytes(buf) for buf in post_buffers] if not stream_ptrs: stream_ptrs = [0] * len(post_buffers) - self.lib.getGPUDirectData(post_buf_data, remote_bufs, stream_ptrs, return_fut) + self.lib.getGPUDirectData(post_buf_data, post_buf_sizes, remote_bufs, stream_ptrs, return_fut) + return return_fut + + def getGPUDirectDataFromAddresses(self, post_buf_ptrs, post_buf_sizes, remote_bufs, stream_ptrs): + return_fut = self.Future(len(post_buf_ptrs)) + if not stream_ptrs: + stream_ptrs = array.array('L', [0] * len(post_buf_ptrs)) + self.lib.getGPUDirectDataFromAddresses(post_buf_ptrs, post_buf_sizes, remote_bufs, stream_ptrs, return_fut) return return_fut # deposit value of one of the futures that was created on this PE From 61603ad0b5e7867746eb866c86f2446772251ed7 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Sat, 23 Jan 2021 18:31:17 -0500 Subject: [PATCH 036/107] update CkGetGPUDirectData to use future value --- charm4py/charmlib/ccharm.pxd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charm4py/charmlib/ccharm.pxd b/charm4py/charmlib/ccharm.pxd index 7f2a34b2..ce06826d 100644 --- a/charm4py/charmlib/ccharm.pxd +++ b/charm4py/charmlib/ccharm.pxd @@ -83,7 +83,7 @@ cdef extern from "charm.h": void registerArrayMsgGPUDirectRecvExtCallback(void (*cb)(int, int, int*, int, int, long*, void *, int, char*, int)); void CkGetGPUDirectData(int numBuffers, void *recvBufPtrs, int *arrSizes, - void *remoteBufInfo, void *streamPtrs, int *futureId); + void *remoteBufInfo, void *streamPtrs, int futureId); int CkDeviceBufferSizeInBytes(); From 30a5340385f8830db06a197f074c5546da42e974 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Sat, 23 Jan 2021 18:31:46 -0500 Subject: [PATCH 037/107] GPUDirect buffers can now come from device pointers --- charm4py/charmlib/charmlib_cython.pyx | 82 ++++++++++++++++++++------- 1 file changed, 61 insertions(+), 21 deletions(-) diff --git a/charm4py/charmlib/charmlib_cython.pyx b/charm4py/charmlib/charmlib_cython.pyx index f20155b1..2eb7cde4 100644 --- a/charm4py/charmlib/charmlib_cython.pyx +++ b/charm4py/charmlib/charmlib_cython.pyx @@ -492,6 +492,34 @@ class CharmLib(object): ) gpu_direct_buf_idx = 0 + def CkArraySendWithDeviceDataFromPointers(self, int array_id, index not 
None, int ep, + msg not None, array.array gpu_src_ptrs, + array.array gpu_src_sizes, + list stream_ptrs, int num_bufs): + + cdef int i = 0 + cdef int ndims = len(index) + # assert ndims == 1 + for i in range(ndims): c_index[i] = index[i] + msg0, dcopy = msg + dcopy = None + + if stream_ptrs: + for i in range(num_bufs): + gpu_direct_stream_ptrs[i] = stream_ptrs[i] + else: + memset(gpu_direct_stream_ptrs, 0, sizeof(long) * num_bufs) + + CkChareExtSendWithDeviceData(array_id, c_index, ndims, ep, 1, msg0, len(msg0), + gpu_src_ptrs.data.as_voidptr, + gpu_src_sizes.data.as_voidptr, + gpu_direct_stream_ptrs, + num_bufs + ) + gpu_direct_buf_idx = 0 + + + def CkArraySend(self, int array_id, index not None, int ep, msg not None): global cur_buf msg0, dcopy = msg @@ -816,7 +844,7 @@ class CharmLib(object): return header, args - def packMsg(self, destObj, msgArgs not None, dict header): + def packMsg(self, destObj, msgArgs not None, dict header, pack_gpu=True): cdef int i = 0 cdef int localTag cdef array.array a @@ -841,13 +869,15 @@ class CharmLib(object): for i in range(len(args)): arg = msgArgs[i] if CkCudaEnabled() and hasattr(arg, '__cuda_array_interface__'): - # we want to take the args that implement the cuda array interface and make them into ckdevicebuffers - # assumption: we can get nbytes from the arg directly - # TODO: verify this assertion for other types - gpu_direct_device_ptrs[gpu_direct_buf_idx] = arg.__cuda_array_interface__['data'][0] - gpu_direct_buff_sizes[gpu_direct_buf_idx] = arg.nbytes - cuda_dev_info = True - gpu_direct_buf_idx += 1 + if pack_gpu: + # we want to take the args that implement the cuda array interface and make them into ckdevicebuffers + # assumption: we can get nbytes from the arg directly + # TODO: verify this assertion for other types + # gpu_direct_device_ptrs[gpu_direct_buf_idx] = arg.__cuda_array_interface__['data'][0] + gpu_direct_device_ptrs[gpu_direct_buf_idx] = arg.__cuda_array_interface__['data'][0] + gpu_direct_buff_sizes[gpu_direct_buf_idx] = arg.nbytes + cuda_dev_info = True + gpu_direct_buf_idx += 1 args[i] = None # TODO: should this be done? continue elif isinstance(arg, np.ndarray) and not arg.dtype.hasobject: @@ -886,10 +916,10 @@ class CharmLib(object): def scheduleTagAfter(self, int tag, double msecs): CcdCallFnAfter(CcdCallFnAfterCallback, tag, msecs) - def getGPUDirectData(self, list post_buf_data, list remote_bufs, list stream_ptrs, return_fut): + + def getGPUDirectData(self, list post_buf_data, list post_buf_sizes, array.array remote_bufs, list stream_ptrs, return_fut): cdef int num_buffers = len(post_buf_data) - cdef int *future_id = malloc(sizeof(int)) - future_id[0] = return_fut.fid + cdef int future_id = return_fut.fid cdef array.array int_array_template = array.array('i', []) cdef array.array long_array_template = array.array('L', []) cdef array.array recv_buf_sizes @@ -905,8 +935,8 @@ class CharmLib(object): for idx in range(num_buffers): recv_buf_ptrs[idx] = post_buf_data[idx] - recv_buf_sizes[idx] = remote_bufs[idx][0] - remote_buf_ptrs[idx] = remote_bufs[idx][1] + recv_buf_sizes[idx] = post_buf_sizes[idx] + remote_buf_ptrs[idx] = remote_bufs[idx] stream_ptrs_forc[idx] = stream_ptrs[idx] # what do we do about the return future? Need to turn it into some callback. 
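      # (At this point in the series the future is already handled: its id is
      # passed to CkGetGPUDirectData below and deposited from depositFutureWithId
      # once the Rgets complete. Callers can also bypass Python-side packing by
      # passing raw device pointers, as the pingpong benchmark does, e.g.:
      #
      #   d_data_send_addr = array.array('L', [d_data_send.__cuda_array_interface__['data'][0]])
      #   d_data_send_size = array.array('L', [d_data_send.nbytes])
      #   partner_channel.send(gpu_src_ptrs=d_data_send_addr, gpu_src_sizes=d_data_send_size)
      #   partner_channel.recv(post_buf_addresses=d_data_recv_addr, post_buf_sizes=d_data_recv_size))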
CkGetGPUDirectData(num_buffers, recv_buf_ptrs.data.as_voidptr, @@ -916,6 +946,18 @@ class CharmLib(object): future_id ) + def getGPUDirectDataFromAddresses(self, array.array post_buf_ptrs, array.array post_buf_sizes, array.array remote_bufs, array.array stream_ptrs, return_fut): + cdef int num_buffers = len(post_buf_ptrs) + cdef int future_id = return_fut.fid + # pointers from the remote that we will be issuing Rgets for + # these are pointers to type CkDeviceBuffer + CkGetGPUDirectData(num_buffers, post_buf_ptrs.data.as_voidptr, + post_buf_sizes.data.as_voidptr, + remote_bufs.data.as_voidptr, + stream_ptrs.data.as_voidptr, + future_id + ) + # first callback from Charm++ shared library cdef void registerMainModule(): try: @@ -970,19 +1012,19 @@ cdef void recvArrayMsg(int aid, int ndims, int *arrayIndex, int ep, int msgSize, cdef void recvGPUDirectMsg(int aid, int ndims, int *arrayIndex, int ep, int numDevBuffs, long *devBufSizes, void *devBufs, int msgSize, char *msg, int dcopy_start): + cdef int idx = 0 try: if PROFILING: charm._precvtime = time.time() charm.recordReceive(msgSize) - devBufInfo = [] + devBufInfo = array.array('L', [0] * numDevBuffs) for idx in range(numDevBuffs): - # Add the size of this buffer and a pointer to it to the info list - devBufInfo.append((devBufSizes[idx], - devBufs+(CK_DEVICEBUFFER_SIZE_IN_BYTES*idx)) - ) + # Add the buffer's address to the list + devBufInfo[idx] = devBufs+(CK_DEVICEBUFFER_SIZE_IN_BYTES*idx) recv_buffer.setMsg(msg, msgSize) charm.recvGPUDirectMsg(aid, array_index_to_tuple(ndims, arrayIndex), ep, devBufInfo, recv_buffer, dcopy_start) + except: charm.handleGeneralError() @@ -1045,9 +1087,7 @@ cdef void resumeFromSync(int aid, int ndims, int *arrayIndex): charm.handleGeneralError() cdef void depositFutureWithId(void *param, void *msg): -# TODO: Figure out how this param value should be allocated/deallocated - cdef int futureId = ( param)[0] - free(param) + cdef int futureId = param charm._future_deposit_result(futureId) From 882eb986fe045db15141599cda911a77a9063bc2 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Sat, 23 Jan 2021 18:32:02 -0500 Subject: [PATCH 038/107] add benchmark with new address optimization --- tests/benchmark/pingpong_gpu.py | 54 +++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 23 deletions(-) diff --git a/tests/benchmark/pingpong_gpu.py b/tests/benchmark/pingpong_gpu.py index e4474e88..8c1e9e1a 100644 --- a/tests/benchmark/pingpong_gpu.py +++ b/tests/benchmark/pingpong_gpu.py @@ -2,8 +2,11 @@ import time import numpy as np from numba import cuda +import array USE_PINNED = True +# provide the address/size data for GPU-direct addresses. 
Saves ~11us per iteration +USE_ADDRESS_OPTIMIZATION = True WARMUP_ITERS = 10 class Ping(Chare): @@ -44,46 +47,51 @@ def do_iteration(self, message_size, num_iters, done_future): partner = self.thisProxy[partner_idx] partner_channel = Channel(self, partner) - for _ in range(WARMUP_ITERS): - if self.am_low_chare: - if not self.gpu_direct: - d_data_send.copy_to_host(h_data_send) - partner_channel.send(h_data_send) - d_data_recv.copy_to_device(partner_channel.recv()) - else: - partner_channel.send(d_data_send) - partner_channel.recv(d_data_recv) + if USE_ADDRESS_OPTIMIZATION: + d_data_recv_addr = array.array('L', [0]) + d_data_recv_size = array.array('L', [0]) + d_data_send_addr = array.array('L', [0]) + d_data_send_size = array.array('L', [0]) - else: - if not self.gpu_direct: - d_data_recv.copy_to_device(partner_channel.recv()) - d_data_send.copy_to_host(h_data_send) - partner_channel.send(h_data_send) - else: - partner_channel.recv(d_data_recv) - partner_channel.send(d_data_send) + d_data_recv_addr[0] = d_data_recv.__cuda_array_interface__['data'][0] + d_data_recv_size[0] = d_data_recv.nbytes + d_data_send_addr[0] = d_data_send.__cuda_array_interface__['data'][0] + d_data_send_size[0] = d_data_send.nbytes tstart = time.time() - for _ in range(num_iters): + for iternum in range(num_iters + WARMUP_ITERS): + if iternum == WARMUP_ITERS: + tstart = time.time() if self.am_low_chare: if not self.gpu_direct: d_data_send.copy_to_host(h_data_send) partner_channel.send(h_data_send) d_data_recv.copy_to_device(partner_channel.recv()) else: - partner_channel.send(d_data_send) - partner_channel.recv(d_data_recv) - + if USE_ADDRESS_OPTIMIZATION: + partner_channel.send(gpu_src_ptrs = d_data_send_addr, gpu_src_sizes = d_data_send_size) + partner_channel.recv(post_buf_addresses = d_data_recv_addr, + post_buf_sizes = d_data_recv_size + ) + else: + partner_channel.send(d_data_send) + partner_channel.recv(d_data_recv) else: if not self.gpu_direct: d_data_recv.copy_to_device(partner_channel.recv()) d_data_send.copy_to_host(h_data_send) partner_channel.send(h_data_send) else: - partner_channel.recv(d_data_recv) - partner_channel.send(d_data_send) + if USE_ADDRESS_OPTIMIZATION: + partner_channel.recv(post_buf_addresses = d_data_recv_addr, + post_buf_sizes = d_data_recv_size + ) + partner_channel.send(gpu_src_ptrs = d_data_send_addr, gpu_src_sizes = d_data_send_size) + else: + partner_channel.recv(d_data_recv) + partner_channel.send(d_data_send) tend = time.time() From 4be40c94236cb384c953b919353371f5a02aa4dc Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Sun, 24 Jan 2021 14:34:00 -0500 Subject: [PATCH 039/107] update low/high iter breakpoint threshold according to experimental methodology --- tests/benchmark/pingpong_gpu.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/benchmark/pingpong_gpu.py b/tests/benchmark/pingpong_gpu.py index 8c1e9e1a..8a65dcca 100644 --- a/tests/benchmark/pingpong_gpu.py +++ b/tests/benchmark/pingpong_gpu.py @@ -7,6 +7,7 @@ USE_PINNED = True # provide the address/size data for GPU-direct addresses. 
Saves ~11us per iteration USE_ADDRESS_OPTIMIZATION = True +LOW_ITER_THRESHOLD = 8192 WARMUP_ITERS = 10 class Ping(Chare): @@ -144,7 +145,7 @@ def main(args): msg_size = min_msg_size while msg_size <= max_msg_size: - if msg_size <= 1048576: + if msg_size <= LOW_ITER_THRESHOLD: iter = low_iter else: iter = high_iter From 8ef56e5ec14a9a29279bea9b1cff290d761de499 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Mon, 25 Jan 2021 13:37:31 -0500 Subject: [PATCH 040/107] add hooks to cuda copy functions --- charm4py/charmlib/ccharm.pxd | 10 ++++++---- charm4py/charmlib/charmlib_cython.pyx | 12 +++++++++++- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/charm4py/charmlib/ccharm.pxd b/charm4py/charmlib/ccharm.pxd index ce06826d..e11c204f 100644 --- a/charm4py/charmlib/ccharm.pxd +++ b/charm4py/charmlib/ccharm.pxd @@ -1,5 +1,7 @@ # libcharm wrapper for Cython +cdef extern from "cuda_runtime.h": + ctypedef long cudaStream_t cdef extern from "charm.h": @@ -89,6 +91,10 @@ cdef extern from "charm.h": void registerDepositFutureWithIdFn(void (*cb)(void*, void*)); + void CkCUDAHtoD(void *dest, void *src, int nbytes, cudaStream_t stream); + void CkCUDADtoH(void *dest, void *src, int nbytes, cudaStream_t stream); + void CkCUDAStreamSynchronize(cudaStream_t stream); + cdef extern from "spanningTree.h": @@ -96,7 +102,3 @@ cdef extern from "spanningTree.h": int *parent, int *child_count, int **children); -# cdef extern from "ckrdmadevice.h": -# cdef cppclass CkDeviceBuffer: -# CkDeviceBuffer() - diff --git a/charm4py/charmlib/charmlib_cython.pyx b/charm4py/charmlib/charmlib_cython.pyx index 2eb7cde4..c360c264 100644 --- a/charm4py/charmlib/charmlib_cython.pyx +++ b/charm4py/charmlib/charmlib_cython.pyx @@ -957,6 +957,17 @@ class CharmLib(object): stream_ptrs.data.as_voidptr, future_id ) + def CudaHtoD(self, long destAddr, long srcAddr, int nbytes, long streamAddr): + CkCUDAHtoD(destAddr, srcAddr,nbytes, ( streamAddr)[0]); + + def CudaDtoH(self, long destAddr, long srcAddr, int nbytes, long streamAddr): + CkCUDADtoH(destAddr, srcAddr,int(nbytes), ( streamAddr)[0]); + + def CudaStreamSynchronize(self, long streamAddr): + CkCUDAStreamSynchronize((streamAddr)[0]) + + + # first callback from Charm++ shared library cdef void registerMainModule(): @@ -1028,7 +1039,6 @@ cdef void recvGPUDirectMsg(int aid, int ndims, int *arrayIndex, int ep, int numD except: charm.handleGeneralError() - cdef void recvArrayBcast(int aid, int ndims, int nInts, int numElems, int *arrayIndexes, int ep, int msgSize, char *msg, int dcopy_start): cdef int i = 0 try: From f058ab31f084067b34a163eb525b965bfcc5baef Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Mon, 25 Jan 2021 13:42:37 -0500 Subject: [PATCH 041/107] don't use slow Numba transfer functionality when using host-staging --- tests/benchmark/pingpong_gpu.py | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/tests/benchmark/pingpong_gpu.py b/tests/benchmark/pingpong_gpu.py index 8a65dcca..5bfafd6e 100644 --- a/tests/benchmark/pingpong_gpu.py +++ b/tests/benchmark/pingpong_gpu.py @@ -48,7 +48,15 @@ def do_iteration(self, message_size, num_iters, done_future): partner = self.thisProxy[partner_idx] partner_channel = Channel(self, partner) - if USE_ADDRESS_OPTIMIZATION: + my_stream = cuda.stream() + stream_address = my_stream.handle.value + d_data_send_addr = d_data_send.__cuda_array_interface__['data'][0] + h_data_send_addr = h_data_send.__array_interface__['data'][0] + + d_data_recv_addr = 
d_data_recv.__cuda_array_interface__['data'][0] + h_data_recv_addr = h_data_recv.__array_interface__['data'][0] + + if self.gpu_direct and USE_ADDRESS_OPTIMIZATION: d_data_recv_addr = array.array('L', [0]) d_data_recv_size = array.array('L', [0]) d_data_send_addr = array.array('L', [0]) @@ -67,9 +75,14 @@ def do_iteration(self, message_size, num_iters, done_future): tstart = time.time() if self.am_low_chare: if not self.gpu_direct: - d_data_send.copy_to_host(h_data_send) + charm.lib.CudaDtoH(h_data_send_addr, d_data_send_addr, message_size, stream_address) + charm.lib.CudaStreamSynchronize(stream_address) + partner_channel.send(h_data_send) - d_data_recv.copy_to_device(partner_channel.recv()) + received = partner_channel.recv() + + charm.lib.CudaHtoD(d_data_recv_addr, received.__array_interface__['data'][0], message_size, stream_address) + charm.lib.CudaStreamSynchronize(stream_address) else: if USE_ADDRESS_OPTIMIZATION: partner_channel.send(gpu_src_ptrs = d_data_send_addr, gpu_src_sizes = d_data_send_size) @@ -81,8 +94,12 @@ def do_iteration(self, message_size, num_iters, done_future): partner_channel.recv(d_data_recv) else: if not self.gpu_direct: - d_data_recv.copy_to_device(partner_channel.recv()) - d_data_send.copy_to_host(h_data_send) + received = partner_channel.recv() + + charm.lib.CudaHtoD(d_data_recv_addr, received.__array_interface__['data'][0], message_size, stream_address) + charm.lib.CudaDtoH(h_data_send_addr, d_data_send_addr, message_size, stream_address) + charm.lib.CudaStreamSynchronize(stream_address) + partner_channel.send(h_data_send) else: if USE_ADDRESS_OPTIMIZATION: From 6cad70bd885b1c97f276f111a9e0d39c62434b00 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Mon, 25 Jan 2021 15:24:17 -0500 Subject: [PATCH 042/107] host-staging bandwidth test --- tests/benchmark/bandwidth.py | 132 +++++++++++++++++++++++++++++++++++ 1 file changed, 132 insertions(+) create mode 100644 tests/benchmark/bandwidth.py diff --git a/tests/benchmark/bandwidth.py b/tests/benchmark/bandwidth.py new file mode 100644 index 00000000..a576d6a2 --- /dev/null +++ b/tests/benchmark/bandwidth.py @@ -0,0 +1,132 @@ +from charm4py import charm, Chare, Array, coro, Future, Channel, Group, ArrayMap +import time +import numpy as np +from numba import cuda +import array + +USE_PINNED = True +# provide the address/size data for GPU-direct addresses. 
Saves ~11us per iteration +USE_ADDRESS_OPTIMIZATION = True +LOW_ITER_THRESHOLD = 8192 +WARMUP_ITERS = 10 + + +class Block(Chare): + def __init__(self, use_gpudirect): + self.gpu_direct = use_gpudirect + self.num_chares = charm.numPes() + self.am_low_chare = self.thisIndex[0] == 0 + + if self.am_low_chare: + print("Msg Size, Iterations, Bandwidth (MB/s)") + + @coro + def do_iteration(self, message_size, windows, num_iters, done_future): + if USE_PINNED: + h_local_data = cuda.pinned_array(message_size, dtype='int8') + h_remote_data = cuda.pinned_array(message_size, dtype='int8') + else: + if self.am_low_chare: + h_local_data = np.ones(message_size, dtype='int8') + h_remote_data = np.ones(message_size, dtype='int8') + else: + h_local_data = np.zeros(message_size, dtype='int8') + h_remote_data = np.zeros(message_size, dtype='int8') + + + d_local_data = cuda.device_array(message_size, dtype='int8') + d_remote_data = cuda.device_array(message_size, dtype='int8') + + my_stream = cuda.stream() + stream_address = my_stream.handle.value + + d_local_data_addr = d_local_data.__cuda_array_interface__['data'][0] + h_local_data_addr = h_local_data.__array_interface__['data'][0] + + d_remote_data_addr = d_remote_data.__cuda_array_interface__['data'][0] + h_remote_data_addr = h_remote_data.__array_interface__['data'][0] + + partner_idx = int(not self.thisIndex[0]) + partner = self.thisProxy[partner_idx] + partner_channel = Channel(self, partner) + + tstart = 0 + + for idx in range(num_iters + WARMUP_ITERS): + if idx == WARMUP_ITERS: + tstart = time.time() + if self.am_low_chare: + if not self.gpu_direct: + for _ in range(windows): + charm.lib.CudaDtoH(h_local_data_addr, d_local_data_addr, message_size, stream_address) + charm.lib.CudaStreamSynchronize(stream_address) + for _ in range(windows): + partner_channel.send(h_local_data) + partner_channel.recv() + else: + pass + else: + if not self.gpu_direct: + for _ in range(windows): + received = partner_channel.recv() + charm.lib.CudaHtoD(d_remote_data_addr, received.__array_interface__['data'][0], + message_size, stream_address + ) + charm.lib.CudaStreamSynchronize(stream_address) + partner_channel.send(1) + else: + pass + + tend = time.time() + elapsed_time = tend - tstart + if self.am_low_chare: + self.display_iteration_data(elapsed_time, num_iters, windows, message_size) + + self.reduce(done_future) + + def display_iteration_data(self, elapsed_time, num_iters, windows, message_size): + data_sent = message_size / 1e6 * num_iters * windows; + print(f'{message_size},{num_iters},{data_sent/elapsed_time}') + + + +class ArrMap(ArrayMap): + def procNum(self, index): + return index[0] % 2 + + +def main(args): + if len(args) < 7: + print("Doesn't have the required input params. 
Usage:" + " " + " " + "\n" + ) + charm.exit(-1) + + min_msg_size = int(args[1]) + max_msg_size = int(args[2]) + window_size = int(args[3]) + low_iter = int(args[4]) + high_iter = int(args[5]) + use_gpudirect = int(args[6]) + + peMap = Group(ArrMap) + blocks = Array(Block, 2, args=[use_gpudirect], map = peMap) + charm.awaitCreation(blocks) + msg_size = min_msg_size + + while msg_size <= max_msg_size: + if msg_size <= LOW_ITER_THRESHOLD: + iter = low_iter + else: + iter = high_iter + done_future = Future() + blocks.do_iteration(msg_size, window_size, iter, done_future) + done_future.get() + msg_size *= 2 + + charm.exit() + + +charm.start(main) From 9e0ff3121a91b3dc884d7c99c89d3b1bdadd9574 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Mon, 25 Jan 2021 15:49:12 -0500 Subject: [PATCH 043/107] add gpudirect bw test --- tests/benchmark/bandwidth.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/tests/benchmark/bandwidth.py b/tests/benchmark/bandwidth.py index a576d6a2..318fe849 100644 --- a/tests/benchmark/bandwidth.py +++ b/tests/benchmark/bandwidth.py @@ -46,6 +46,14 @@ def do_iteration(self, message_size, windows, num_iters, done_future): d_remote_data_addr = d_remote_data.__cuda_array_interface__['data'][0] h_remote_data_addr = h_remote_data.__array_interface__['data'][0] + if self.gpu_direct: + d_local_data_addr = array.array('L', [0]) + d_local_data_size = array.array('L', [0]) + + d_local_data_addr[0] = d_local_data.__cuda_array_interface__['data'][0] + d_local_data_size[0] = d_local_data.nbytes + + partner_idx = int(not self.thisIndex[0]) partner = self.thisProxy[partner_idx] partner_channel = Channel(self, partner) @@ -64,7 +72,11 @@ def do_iteration(self, message_size, windows, num_iters, done_future): partner_channel.send(h_local_data) partner_channel.recv() else: - pass + for _ in range(windows): + partner_channel.send(gpu_src_ptrs = d_local_data_addr, + gpu_src_sizes = d_local_data_size + ) + partner_channel.recv() else: if not self.gpu_direct: for _ in range(windows): @@ -73,9 +85,11 @@ def do_iteration(self, message_size, windows, num_iters, done_future): message_size, stream_address ) charm.lib.CudaStreamSynchronize(stream_address) - partner_channel.send(1) else: - pass + for _ in range(windows): + partner_channel.recv(post_buf_addresses = d_local_data_addr, + post_buf_sizes = d_local_data_size) + partner_channel.send(1) tend = time.time() elapsed_time = tend - tstart From 1752446db21ccb4461e36e58f6c03c3b0546e005 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Mon, 25 Jan 2021 19:44:08 -0500 Subject: [PATCH 044/107] remove comment --- charm4py/charm.py | 1 - 1 file changed, 1 deletion(-) diff --git a/charm4py/charm.py b/charm4py/charm.py index f1e87523..334ffd6a 100644 --- a/charm4py/charm.py +++ b/charm4py/charm.py @@ -334,7 +334,6 @@ def recvArrayBcast(self, aid, indexes, ep, msg, dcopy_start): self.invokeEntryMethod(array[index], ep, header, args) def unpackMsg(self, msg, dcopy_start, dest_obj): - # Issue Rgets for GPU data in unpackMsg? But how does recv work? 
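        # (How recv works with GPU buffers in this series: the receiver posts
        # destination device buffers to Channel.recv, e.g.
        #
        #   partner_channel.recv(d_data_recv)   # Numba device array, or
        #   partner_channel.recv(post_buf_addresses=d_data_recv_addr,
        #                        post_buf_sizes=d_data_recv_size)
        #
        # and recv then calls charm.getGPUDirectData / getGPUDirectDataFromAddresses,
        # which issue the Rgets and return a future that recv waits on before
        # returning the message.)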
if msg[:7] == b'_local:': header, args = dest_obj.__removeLocal__(int(msg[7:])) else: From 9582b2a85590aa82b1f1f7154ebf6f0c08fac4d0 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Mon, 25 Jan 2021 19:44:39 -0500 Subject: [PATCH 045/107] fix benchmark for non gpu-direct --- tests/benchmark/bandwidth.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/benchmark/bandwidth.py b/tests/benchmark/bandwidth.py index 318fe849..d2c082dc 100644 --- a/tests/benchmark/bandwidth.py +++ b/tests/benchmark/bandwidth.py @@ -70,12 +70,12 @@ def do_iteration(self, message_size, windows, num_iters, done_future): charm.lib.CudaStreamSynchronize(stream_address) for _ in range(windows): partner_channel.send(h_local_data) - partner_channel.recv() else: for _ in range(windows): partner_channel.send(gpu_src_ptrs = d_local_data_addr, gpu_src_sizes = d_local_data_size ) + partner_channel.recv() else: if not self.gpu_direct: From 6f955077dfd67fd735f7e5f430e5428fab023fc0 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Mon, 25 Jan 2021 20:01:55 -0500 Subject: [PATCH 046/107] separate channels for data/ack --- tests/benchmark/bandwidth.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/benchmark/bandwidth.py b/tests/benchmark/bandwidth.py index d2c082dc..6f013055 100644 --- a/tests/benchmark/bandwidth.py +++ b/tests/benchmark/bandwidth.py @@ -57,6 +57,7 @@ def do_iteration(self, message_size, windows, num_iters, done_future): partner_idx = int(not self.thisIndex[0]) partner = self.thisProxy[partner_idx] partner_channel = Channel(self, partner) + partner_ack_channel = Channel(self, partner) tstart = 0 @@ -68,6 +69,7 @@ def do_iteration(self, message_size, windows, num_iters, done_future): for _ in range(windows): charm.lib.CudaDtoH(h_local_data_addr, d_local_data_addr, message_size, stream_address) charm.lib.CudaStreamSynchronize(stream_address) + # d_local_data.copy_to_host(h_local_data) for _ in range(windows): partner_channel.send(h_local_data) else: @@ -76,7 +78,7 @@ def do_iteration(self, message_size, windows, num_iters, done_future): gpu_src_sizes = d_local_data_size ) - partner_channel.recv() + partner_ack_channel.recv() else: if not self.gpu_direct: for _ in range(windows): @@ -85,11 +87,12 @@ def do_iteration(self, message_size, windows, num_iters, done_future): message_size, stream_address ) charm.lib.CudaStreamSynchronize(stream_address) + # d_local_data.copy_to_device(received) else: for _ in range(windows): partner_channel.recv(post_buf_addresses = d_local_data_addr, post_buf_sizes = d_local_data_size) - partner_channel.send(1) + partner_ack_channel.send(1) tend = time.time() elapsed_time = tend - tstart From c62c621c6e01ab2b85bc5fbaf8defd42eced7bce Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Mon, 25 Jan 2021 21:02:21 -0500 Subject: [PATCH 047/107] fix indentation of partner_ack --- tests/benchmark/bandwidth.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/benchmark/bandwidth.py b/tests/benchmark/bandwidth.py index 6f013055..69da8180 100644 --- a/tests/benchmark/bandwidth.py +++ b/tests/benchmark/bandwidth.py @@ -78,7 +78,7 @@ def do_iteration(self, message_size, windows, num_iters, done_future): gpu_src_sizes = d_local_data_size ) - partner_ack_channel.recv() + partner_ack_channel.recv() else: if not self.gpu_direct: for _ in range(windows): From 99d4398ee35a49a68051ff60a9fb84525ca084be Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Tue, 26 Jan 2021 12:21:01 -0500 Subject: [PATCH 048/107] initialize jacobi object --- 
examples/cuda/gpudirect/jacobi3d/jacobi3d.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 examples/cuda/gpudirect/jacobi3d/jacobi3d.py diff --git a/examples/cuda/gpudirect/jacobi3d/jacobi3d.py b/examples/cuda/gpudirect/jacobi3d/jacobi3d.py new file mode 100644 index 00000000..e69de29b From 121076d91c54567203cd8ca25a098c150548154b Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Tue, 26 Jan 2021 12:45:04 -0500 Subject: [PATCH 049/107] add CLI arguments --- examples/cuda/gpudirect/jacobi3d/jacobi3d.py | 52 ++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/examples/cuda/gpudirect/jacobi3d/jacobi3d.py b/examples/cuda/gpudirect/jacobi3d/jacobi3d.py index e69de29b..5de2cc83 100644 --- a/examples/cuda/gpudirect/jacobi3d/jacobi3d.py +++ b/examples/cuda/gpudirect/jacobi3d/jacobi3d.py @@ -0,0 +1,52 @@ +from charm4py import * +from numba import cuda +from argparse import ArgumentParser +from enum import Enum + +class Defaults(Enum): + GRID_WIDTH = 512, + GRID_HEIGHT = 512, + GRID_DEPTH = 512, + NUM_ITERS = 512, + WARMUP_ITERS = 10, + USE_ZEROCOPY = False + PRINT_ELEMENTS = False + + +def main(args): + Defaults.NUM_CHARES = charm.numPes() + argp = ArgumentParser(description = "Jacobi3D implementation in Charm4Py using " + "CUDA and GPU-Direct communication" + ) + argp.add_argument('-x', '--grid_width', help = "Grid width", + default = Defaults.GRID_WIDTH.value + ) + argp.add_argument('-y', '--grid_height', help = "Grid height", + default = Defaults.GRID_HEIGHT.value + ) + argp.add_argument('-z', '--grid_depth', help = "Grid depth", + default = Defaults.GRID_DEPTH.value + ) + argp.add_argument('-c', '--num_chares', help = "Number of chares", + default = Defaults.NUM_CHARES + ) + argp.add_argument('-i', '--iterations', help = "Number of iterations", + default = Defaults.NUM_ITERS.value + ) + argp.add_argument('-w', '--warmup_iterations', help = "Number of warmup iterations", + default = Defaults.WARMUP_ITERS.value + ) + argp.add_argument('-d', '--use_zerocopy', action = "store_true", + help = "Use zerocopy when performing data transfers", + default = Defaults.USE_ZEROCOPY.value + ) + argp.add_argument('-p', '--print_blocks', help = "Print blocks", + action = "store_true", + default = Defaults.PRINT_ELEMENTS.value + ) + args = argp.parse_args() + # charm.exit() + +# charm.start(main) +if __name__ == '__main__': + main(None) From b202babd1f5890d01ba439ef4230801bfb4f7945 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Tue, 26 Jan 2021 12:52:41 -0500 Subject: [PATCH 050/107] create file containing cuda kernels --- examples/cuda/gpudirect/jacobi3d/kernels.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 examples/cuda/gpudirect/jacobi3d/kernels.py diff --git a/examples/cuda/gpudirect/jacobi3d/kernels.py b/examples/cuda/gpudirect/jacobi3d/kernels.py new file mode 100644 index 00000000..e69de29b From d22cdfb4c5abff49524a1ef6683522d3fbb1e942 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Tue, 26 Jan 2021 13:12:40 -0500 Subject: [PATCH 051/107] initialized a few kernels --- examples/cuda/gpudirect/jacobi3d/kernels.py | 49 +++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/examples/cuda/gpudirect/jacobi3d/kernels.py b/examples/cuda/gpudirect/jacobi3d/kernels.py index e69de29b..cbf43515 100644 --- a/examples/cuda/gpudirect/jacobi3d/kernels.py +++ b/examples/cuda/gpudirect/jacobi3d/kernels.py @@ -0,0 +1,49 @@ +from numba import cuda +from numba.cuda import blockDim, blockIdx, threadIdx + +@cuda.jit(device=True) +def IDX(i,j,k, 
block_width, block_height): + return ((block_width+2)*(block_height+2)*(k)+(block_width+2)*(j)+(i)) + +@cuda.jit +def initKernel(temperature, block_width, block_height, block_depth): + i = blockDim.x * blockIdx.x + threadIdx.x + j = blockDim.y * blockIdx.y + threadIdx.y + k = blockDim.z * blockIdx.z + threadIdx.z + + if i < block_width + 2 and j < block_height + 2 and k < block_depth + 2: + temperature[IDX(i, j, k, block_width, block_height)] = 0 + +@cuda.jit +def ghostInitKernel(ghost, ghost_count): + i = blockDim.x * blockIdx.x + threadIdx.x + if i < ghost_count: + ghost[i] = 0 + +@cuda.jit +def leftBoundaryKernel(temperature, block_width, block_height, block_depth): + j = blockDim.x * blockIdx.x + threadIdx.x + k = blockDim.y * blockIdx.y + threadIdx.y + if j < block_height and k < block_depth: + temperature[IDX(0,1+j,1+k, block_width, block_height)] = 1; + +@cuda.jit +def rightBoundaryKernel(temperature, block_width, block_height, block_depth): + pass + +@cuda.jit +def topBoundaryKernel(temperature, block_width, block_height, block_depth): + pass + +@cuda.jit +def bottomBoundaryKernel(temperature, block_width, block_height, block_depth): + pass + +@cuda.jit +def frontBoundaryKernel(temperature, block_width, block_height, block_depth): + pass + +@cuda.jit +def backBoundaryKernel(temperature, block_width, block_height, block_depth): + pass + From bc821748d3c3eaf6e0f73dd72a939cbec02179d3 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Tue, 26 Jan 2021 13:12:49 -0500 Subject: [PATCH 052/107] include time as well --- examples/cuda/gpudirect/jacobi3d/jacobi3d.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/cuda/gpudirect/jacobi3d/jacobi3d.py b/examples/cuda/gpudirect/jacobi3d/jacobi3d.py index 5de2cc83..77d62c88 100644 --- a/examples/cuda/gpudirect/jacobi3d/jacobi3d.py +++ b/examples/cuda/gpudirect/jacobi3d/jacobi3d.py @@ -2,6 +2,7 @@ from numba import cuda from argparse import ArgumentParser from enum import Enum +import time class Defaults(Enum): GRID_WIDTH = 512, @@ -45,6 +46,7 @@ def main(args): default = Defaults.PRINT_ELEMENTS.value ) args = argp.parse_args() + # charm.exit() # charm.start(main) From a4ba3cf0a73664628a4f7406b68f460678c335e0 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 27 Jan 2021 09:11:48 -0500 Subject: [PATCH 053/107] include boundary, pack kernels --- examples/cuda/gpudirect/jacobi3d/kernels.py | 248 +++++++++++++++++++- 1 file changed, 243 insertions(+), 5 deletions(-) diff --git a/examples/cuda/gpudirect/jacobi3d/kernels.py b/examples/cuda/gpudirect/jacobi3d/kernels.py index cbf43515..b1aec20b 100644 --- a/examples/cuda/gpudirect/jacobi3d/kernels.py +++ b/examples/cuda/gpudirect/jacobi3d/kernels.py @@ -1,6 +1,17 @@ from numba import cuda from numba.cuda import blockDim, blockIdx, threadIdx +TILE_SIZE_3D = 8 +TILE_SIZE_2D = 16 + +LEFT = 0 +RIGHT = 1 +TOP = 2 +BOTTOM = 3 +FRONT = 4 +BACK = 5 +DIR_COUNT = 6 + @cuda.jit(device=True) def IDX(i,j,k, block_width, block_height): return ((block_width+2)*(block_height+2)*(k)+(block_width+2)*(j)+(i)) @@ -29,21 +40,248 @@ def leftBoundaryKernel(temperature, block_width, block_height, block_depth): @cuda.jit def rightBoundaryKernel(temperature, block_width, block_height, block_depth): - pass + j = blockDim.x*blockIdx.x+threadIdx.x + k = blockDim.y*blockIdx.y+threadIdx.y + if j < block_height and k < block_depth: + temperature[IDX(block_width+1,1+j,1+k, block_width, block_height)] = 1; + @cuda.jit def topBoundaryKernel(temperature, block_width, block_height, block_depth): - pass + i = 
blockDim.x*blockIdx.x+threadIdx.x + k = blockDim.y*blockIdx.y+threadIdx.y + if i < block_width and k < block_depth: + temperature[IDX(1+i,0,1+k, block_width, block_height)] = 1 + @cuda.jit def bottomBoundaryKernel(temperature, block_width, block_height, block_depth): - pass + i = blockDim.x*blockIdx.x+threadIdx.x + k = blockDim.y*blockIdx.y+threadIdx.y + if i < block_width and k < block_depth: + temperature[IDX(1+i,block_height+1,1+k, block_width, block_height)] = 1 @cuda.jit def frontBoundaryKernel(temperature, block_width, block_height, block_depth): - pass + i = blockDim.x*blockIdx.x+threadIdx.x + j = blockDim.y*blockIdx.y+threadIdx.y + if i < block_width and j < block_height: + temperature[IDX(1+i,1+j,0, block_width, block_height)] = 1; + @cuda.jit def backBoundaryKernel(temperature, block_width, block_height, block_depth): - pass + i = blockDim.x*blockIdx.x+threadIdx.x + j = blockDim.y*blockIdx.y+threadIdx.y + if i < block_width and j < block_height: + temperature[IDX(1+i,1+j,block_depth+1, block_width, block_height)] = 1 + +@cuda.jit +def jacobiKernel(temp, new_temp, block_width, block_height, block_depth): + i = (blockDim.x*blockIdx.x+threadIdx.x)+1 + j = (blockDim.y*blockIdx.y+threadIdx.y)+1 + k = (blockDim.z*blockIdx.z+threadIdx.z)+1 + + if (i <= block_width && j <= block_height && k <= block_depth): + new_temperature[IDX(i,j,k, block_width, block_height)] = + (temperature[IDX(i,j,k, block_width, block_height)] + + temperature[IDX(i-1,j,k, block_width, block_height)] + + temperature[IDX(i+1,j,k, block_width, block_height)] + + temperature[IDX(i,j-1,k, block_width, block_height)] + + temperature[IDX(i,j+1,k, block_width, block_height)] + + temperature[IDX(i,j,k-1, block_width, block_height)] + + temperature[IDX(i,j,k+1, block_width, block_height)]) * + 0.142857 # equivalent to dividing by 7 + +@cuda.jit +def leftPackingKernel(temperature, ghost, block_width, block_height, block_depth): + j = blockDim.x*blockIdx.x+threadIdx.x; + k = blockDim.y*blockIdx.y+threadIdx.y; + if j < block_height and k < block_depth: + ghost[block_height*k+j] = + temperature[IDX(1,1+j,1+k, block_width, block_height)] + +@cuda.jit +def rightPackingKernel(temperature, ghost, block_width, block_height, block_depth): + j = blockDim.x*blockIdx.x+threadIdx.x + k = blockDim.y*blockIdx.y+threadIdx.y + if j < block_height and k < block_depth: + ghost[block_height*k+j] = + temperature[IDX(1,1+j,1+k, block_width, block_height)] + } + + +@cuda.jit +def topPackingKernel(temperature, ghost, block_width, block_height, block_depth): + i = blockDim.x*blockIdx.x+threadIdx.x + k = blockDim.y*blockIdx.y+threadIdx.y + if i < block_width and k < block_depth: + ghost[block_width*k+i] = + temperature[IDX(1+i,1,1+k, block_width, block_height)] + +@cuda.jit +def bottomPackingKernel(temperature, ghost, block_width, block_height, block_depth): + i = blockDim.x*blockIdx.x+threadIdx.x + k = blockDim.y*blockIdx.y+threadIdx.y + if i < block_width and k < block_depth: + ghost[block_width*k+i] = + temperature[IDX(1+i,block_height,1+k, block_width, block_height)]; + } + +@cuda.jit +def frontPackingKernel(temperature, ghost, block_width, block_height, block_depth): + i = blockDim.x*blockIdx.x+threadIdx.x + j = blockDim.y*blockIdx.y+threadIdx.y + if i < block_width and j < block_height: + temperature[IDX(1+i,1+j,0, block_width, block_height)] = + ghost[block_width*j+i] + +@cuda.jit +def backPackingKernel(temperature, ghost, block_width, block_height, block_depth): + i = blockDim.x*blockIdx.x+threadIdx.x + j = blockDim.y*blockIdx.y+threadIdx.y 
+ if i < block_width and j < block_height: + temperature[IDX(1+i,1+j,block_depth+1, block_width, block_height)] = + ghost[block_width*j+i] + + +@cuda.jit +def leftUnpackingKernel(temperature, ghost, block_width, block_height, block_depth): + j = blockDim.x*blockIdx.x+threadIdx.x + k = blockDim.y*blockIdx.y+threadIdx.y + if j < block_height and k < block_depth: + temperature[IDX(0,1+j,1+k, block_width, block_height)] = ghost[block_height*k+j] + + + +@cuda.jit +def rightUnpackingKernel(temperature, ghost, block_width, block_height, block_depth): + j = blockDim.x*blockIdx.x+threadIdx.x + k = blockDim.y*blockIdx.y+threadIdx.y + if j < block_height and k < block_depth: + temperature[IDX(block_width+1,1+j,1+k, block_width, block_height)] = ghost[block_height*k+j] + +@cuda.jit +def topUnpackingKernel(temperature, ghost, block_width, block_height, block_depth): + i = blockDim.x*blockIdx.x+threadIdx.x + k = blockDim.y*blockIdx.y+threadIdx.y + if i < block_width and k < block_depth: + temperature[IDX(1+i,0,1+k, block_width, block_height)] = ghost[block_width*k+i] + +@cuda.jit +def bottomUnpackingKernel(temperature, ghost, block_width, block_height, block_depth): + i = blockDim.x*blockIdx.x+threadIdx.x + k = blockDim.y*blockIdx.y+threadIdx.y + if i < block_width and k < block_depth: + temperature[IDX(1+i,block_height+1,1+k, block_width, block_height)] = ghost[block_width*k+i] + +@cuda.jit +def frontUnpackingKernel(temperature, ghost, block_width, block_height, block_depth): + i = blockDim.x*blockIdx.x+threadIdx.x + j = blockDim.y*blockIdx.y+threadIdx.y + if i < block_width and j < block_height: + temperature[IDX(1+i,1+j,0, block_width, block_height)] = ghost[block_width*j+i] + +@cuda.jit +def backUnpackingKernel(temperature, ghost, block_width, block_height, block_depth): + i = blockDim.x*blockIdx.x+threadIdx.x + j = blockDim.y*blockIdx.y+threadIdx.y + if i < block_width and j < block_height: + temperature[IDX(1+i,1+j,block_depth+1, block_width, block_height)] = ghost[block_width*j+i] + +def invokeInitKernel(temp_dev_array, block_width, block_height, block_depth, stream): + block_dim = (TILE_SIZE_3D, TILE_SIZE_3D, TILE_SIZE_3D) + grid_dim = (((block_width+2)+(block_dim[0]-1))/block_dim[0], # x + ((block_height+2)+(block_dim[1]-1))/block_dim[1], # y + ((block_depth+2)+(block_dim[2]-1))/block_dim[2]) # z + + initKernel[grid_dim, block_dim, stream](temp_dev_array, + block_width, block_height, + block_depth) + + +def invokeGhostInitKernels(ghosts, ghost_counts, stream): + #TODO: this fn will probably have to change if the ghosts/counts can't + # be transferred automatically + # https://docs.nvidia.com/cuda/cuda-c-programming-guide/#dim3 + block_dim = (256, 1, 1) + dim3 block_dim(256); + for i in range(len(ghosts)): + ghost = ghosts[i] + ghost_count = ghost_counts[i] + grid_dim = (ghost_count+block_dim[0]-1)//block_dim[0], 1, 1) + + ghostInitKernel[grid_dim, block_dim, stream](ghosts, ghost_count) + +def invokeBoundaryKernels(d_temperature, block_width, block_height, block_depth, bounds, stream): + block_dim = (TILE_SIZE_2D, TILE_SIZE_2D, 1) + + if bounds(LEFT): + grid_dim = ((block_height+(block_dim.x-1))/block_dim.x, + (block_depth+(block_dim.y-1))/block_dim.y, 1) + leftBoundaryKernel[grid_dim, block_dim, stream](d_temperature, + block_width, + block_height, + block_depth + ) + if bounds(RIGHT): + grid_dim = ((block_height+(block_dim.x-1))/block_dim.x, + (block_depth+(block_dim.y-1))/block_dim.y, 1) + rightBoundaryKernel[grid_dim, block_dim, stream](d_temperature, + block_width, + block_height, + 
block_depth + ) + + if bounds(TOP): + grid_dim = ((block_width+(block_dim.x-1))/block_dim.x, + (block_depth+(block_dim.y-1))/block_dim.y, 1) + topBoundaryKernel[grid_dim, block_dim, stream](d_temperature, + block_width, + block_height, + block_depth + ) + + if bounds(BOTTOM): + grid_dim = ((block_width+(block_dim.x-1))/block_dim.x, + (block_depth+(block_dim.y-1))/block_dim.y, 1) + bottomBoundaryKernel[grid_dim, block_dim, stream](d_temperature, + block_width, + block_height, + block_depth + ) + + if bounds(FRONT): + grid_dim = ((block_width+(block_dim.x-1))/block_dim.x, + (block_height+(block_dim.y-1))/block_dim.y, 1) + frontBoundaryKernel[grid_dim, block_dim, stream](d_temperature, + block_width, + block_height, + block_depth + ) + + if bounds(BACK): + grid_dim = ((block_width+(block_dim.x-1))/block_dim.x, + (block_height+(block_dim.y-1))/block_dim.y, 1) + backBoundaryKernel[grid_dim, block_dim, stream](d_temperature, + block_width, + block_height, + block_depth + ) + + +def invokeJacobiKernel(d_temperature, d_new_temperature, block_width, block_height, block_depth, stream): + block_dim = (TILE_SIZE_3D, TILE_SIZE_3D, TILE_SIZE_3D) + grid_dim = ((block_width+(block_dim.x-1))/block_dim.x, + (block_height+(block_dim.y-1))/block_dim.y, + (block_depth+(block_dim.z-1))/block_dim.z) + + jacobiKernel[grid_dim, block_dim, stream](d_temperature, + d_new_temperature, + block_width, + block_height, + block_depth + ) + + From 7652d80c582355faa37006d5820b26698bb6a4f9 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 27 Jan 2021 09:37:34 -0500 Subject: [PATCH 054/107] invoke pack/unpack kernels --- examples/cuda/gpudirect/jacobi3d/kernels.py | 114 ++++++++++++++++++++ 1 file changed, 114 insertions(+) diff --git a/examples/cuda/gpudirect/jacobi3d/kernels.py b/examples/cuda/gpudirect/jacobi3d/kernels.py index b1aec20b..213974bd 100644 --- a/examples/cuda/gpudirect/jacobi3d/kernels.py +++ b/examples/cuda/gpudirect/jacobi3d/kernels.py @@ -284,4 +284,118 @@ def invokeJacobiKernel(d_temperature, d_new_temperature, block_width, block_heig ) +def inbokePackingKernel(d_temperature, d_ghost, dir, block_width, block_height, block_depth, stream): + block_dim = (TILE_SIZE_2D, TILE_SIZE_2D, 1) + + if dir == LEFT: + grid_dim = ((block_height+(block_dim.x-1))/block_dim.x, + (block_depth+(block_dim.y-1))/block_dim.y, 1) + leftPackingKernel[grid_dim, block_dim, stream](d_temperature, + d_ghost, + block_width, + block_height, + block_depth + ) + elif dir == RIGHT: + grid_dim = ((block_height+(block_dim.x-1))/block_dim.x, + (block_depth+(block_dim.y-1))/block_dim.y, 1) + rightPackingKernel[grid_dim, block_dim, stream](d_temperature, + d_ghost, + block_width, + block_height, + block_depth + ) + elif dir == TOP: + grid_dim = ((block_width+(block_dim.x-1))/block_dim.x, + (block_depth+(block_dim.y-1))/block_dim.y, 1) + topPackingKernel[grid_dim, block_dim, stream](d_temperature, + d_ghost, + block_width, + block_height, + block_depth + ) + elif dir == BOTTOM: + grid_dim = ((block_width+(block_dim.x-1))/block_dim.x, + (block_depth+(block_dim.y-1))/block_dim.y, 1) + bottomPackingKernel[grid_dim, block_dim, stream](d_temperature, + d_ghost, + block_width, + block_height, + block_depth + ) + elif dir == FRONT: + grid_dim = ((block_width+(block_dim.x-1))/block_dim.x, + (block_height+(block_dim.y-1))/block_dim.y, 1) + frontPackingKernel[grid_dim, block_dim, stream](d_temperature, + d_ghost, + block_width, + block_height, + block_depth + ) + elif dir == BACK: + grid_dim = ((block_width+(block_dim.x-1))/block_dim.x, + 
(block_height+(block_dim.y-1))/block_dim.y, 1) + backPackingKernel[grid_dim, block_dim, stream](d_temperature, + d_ghost, + block_width, + block_height, + block_depth + ) + +def invokeUnpackingKernel(d_temperature, d_ghost, dir, block_width, block_height, block_depth, stream): + block_dim = (TILE_SIZE_2D, TILE_SIZE_2D, 1) + if dir == LEFT: + grid_dim = ((block_height+(block_dim.x-1))/block_dim.x, + (block_depth+(block_dim.y-1))/block_dim.y, 1) + leftUnpackingKernel[grid_dim, block_dim, stream](d_temperature, + d_ghost, + block_width, + block_height, + block_depth + ) + if dir == RIGHT: + grid_dim = ((block_height+(block_dim.x-1))/block_dim.x, + (block_depth+(block_dim.y-1))/block_dim.y, 1) + rightUnpackingKernel[grid_dim, block_dim, stream](d_temperature, + d_ghost, + block_width, + block_height, + block_depth + ) + if dir == TOP: + grid_dim = ((block_width+(block_dim.x-1))/block_dim.x, + (block_depth+(block_dim.y-1))/block_dim.y, 1) + topUnpackingKernel[grid_dim, block_dim, stream](d_temperature, + d_ghost, + block_width, + block_height, + block_depth + ) + if dir == BOTTOM: + grid_dim = ((block_width+(block_dim.x-1))/block_dim.x, + (block_depth+(block_dim.y-1))/block_dim.y, 1) + bottomUnpackingKernel[grid_dim, block_dim, stream](d_temperature, + d_ghost, + block_width, + block_height, + block_depth + ) + if dir == FRONT: + grid_dim = ((block_width+(block_dim.x-1))/block_dim.x, + (block_height+(block_dim.y-1))/block_dim.y, 1) + frontUnpackingKernel[grid_dim, block_dim, stream](d_temperature, + d_ghost, + block_width, + block_height, + block_depth + ) + if dir == BACK: + grid_dim = ((block_width+(block_dim.x-1))/block_dim.x, + (block_height+(block_dim.y-1))/block_dim.y, 1) + backUnpackingKernel[grid_dim, block_dim, stream](d_temperature, + d_ghost, + block_width, + block_height, + block_depth + ) From 8a7253ea66fc2e3d7b87f1f2e51d1a3a8f95dfcf Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 27 Jan 2021 10:26:11 -0500 Subject: [PATCH 055/107] use integer division --- examples/cuda/gpudirect/jacobi3d/kernels.py | 84 ++++++++++----------- 1 file changed, 42 insertions(+), 42 deletions(-) diff --git a/examples/cuda/gpudirect/jacobi3d/kernels.py b/examples/cuda/gpudirect/jacobi3d/kernels.py index 213974bd..bf4d159c 100644 --- a/examples/cuda/gpudirect/jacobi3d/kernels.py +++ b/examples/cuda/gpudirect/jacobi3d/kernels.py @@ -191,9 +191,9 @@ def backUnpackingKernel(temperature, ghost, block_width, block_height, block_dep def invokeInitKernel(temp_dev_array, block_width, block_height, block_depth, stream): block_dim = (TILE_SIZE_3D, TILE_SIZE_3D, TILE_SIZE_3D) - grid_dim = (((block_width+2)+(block_dim[0]-1))/block_dim[0], # x - ((block_height+2)+(block_dim[1]-1))/block_dim[1], # y - ((block_depth+2)+(block_dim[2]-1))/block_dim[2]) # z + grid_dim = (((block_width+2)+(block_dim[0]-1))//block_dim[0], # x + ((block_height+2)+(block_dim[1]-1))//block_dim[1], # y + ((block_depth+2)+(block_dim[2]-1))//block_dim[2]) # z initKernel[grid_dim, block_dim, stream](temp_dev_array, block_width, block_height, @@ -217,16 +217,16 @@ def invokeBoundaryKernels(d_temperature, block_width, block_height, block_depth, block_dim = (TILE_SIZE_2D, TILE_SIZE_2D, 1) if bounds(LEFT): - grid_dim = ((block_height+(block_dim.x-1))/block_dim.x, - (block_depth+(block_dim.y-1))/block_dim.y, 1) + grid_dim = ((block_height+(block_dim.x-1))//block_dim.x, + (block_depth+(block_dim.y-1))//block_dim.y, 1) leftBoundaryKernel[grid_dim, block_dim, stream](d_temperature, block_width, block_height, block_depth ) if bounds(RIGHT): - grid_dim 
= ((block_height+(block_dim.x-1))/block_dim.x, - (block_depth+(block_dim.y-1))/block_dim.y, 1) + grid_dim = ((block_height+(block_dim.x-1))//block_dim.x, + (block_depth+(block_dim.y-1))//block_dim.y, 1) rightBoundaryKernel[grid_dim, block_dim, stream](d_temperature, block_width, block_height, @@ -234,8 +234,8 @@ def invokeBoundaryKernels(d_temperature, block_width, block_height, block_depth, ) if bounds(TOP): - grid_dim = ((block_width+(block_dim.x-1))/block_dim.x, - (block_depth+(block_dim.y-1))/block_dim.y, 1) + grid_dim = ((block_width+(block_dim.x-1))//block_dim.x, + (block_depth+(block_dim.y-1))//block_dim.y, 1) topBoundaryKernel[grid_dim, block_dim, stream](d_temperature, block_width, block_height, @@ -243,8 +243,8 @@ def invokeBoundaryKernels(d_temperature, block_width, block_height, block_depth, ) if bounds(BOTTOM): - grid_dim = ((block_width+(block_dim.x-1))/block_dim.x, - (block_depth+(block_dim.y-1))/block_dim.y, 1) + grid_dim = ((block_width+(block_dim.x-1))//block_dim.x, + (block_depth+(block_dim.y-1))//block_dim.y, 1) bottomBoundaryKernel[grid_dim, block_dim, stream](d_temperature, block_width, block_height, @@ -252,8 +252,8 @@ def invokeBoundaryKernels(d_temperature, block_width, block_height, block_depth, ) if bounds(FRONT): - grid_dim = ((block_width+(block_dim.x-1))/block_dim.x, - (block_height+(block_dim.y-1))/block_dim.y, 1) + grid_dim = ((block_width+(block_dim.x-1))//block_dim.x, + (block_height+(block_dim.y-1))//block_dim.y, 1) frontBoundaryKernel[grid_dim, block_dim, stream](d_temperature, block_width, block_height, @@ -261,8 +261,8 @@ def invokeBoundaryKernels(d_temperature, block_width, block_height, block_depth, ) if bounds(BACK): - grid_dim = ((block_width+(block_dim.x-1))/block_dim.x, - (block_height+(block_dim.y-1))/block_dim.y, 1) + grid_dim = ((block_width+(block_dim.x-1))//block_dim.x, + (block_height+(block_dim.y-1))//block_dim.y, 1) backBoundaryKernel[grid_dim, block_dim, stream](d_temperature, block_width, block_height, @@ -272,9 +272,9 @@ def invokeBoundaryKernels(d_temperature, block_width, block_height, block_depth, def invokeJacobiKernel(d_temperature, d_new_temperature, block_width, block_height, block_depth, stream): block_dim = (TILE_SIZE_3D, TILE_SIZE_3D, TILE_SIZE_3D) - grid_dim = ((block_width+(block_dim.x-1))/block_dim.x, - (block_height+(block_dim.y-1))/block_dim.y, - (block_depth+(block_dim.z-1))/block_dim.z) + grid_dim = ((block_width+(block_dim.x-1))//block_dim.x, + (block_height+(block_dim.y-1))//block_dim.y, + (block_depth+(block_dim.z-1))//block_dim.z) jacobiKernel[grid_dim, block_dim, stream](d_temperature, d_new_temperature, @@ -288,8 +288,8 @@ def inbokePackingKernel(d_temperature, d_ghost, dir, block_width, block_height, block_dim = (TILE_SIZE_2D, TILE_SIZE_2D, 1) if dir == LEFT: - grid_dim = ((block_height+(block_dim.x-1))/block_dim.x, - (block_depth+(block_dim.y-1))/block_dim.y, 1) + grid_dim = ((block_height+(block_dim.x-1))//block_dim.x, + (block_depth+(block_dim.y-1))//block_dim.y, 1) leftPackingKernel[grid_dim, block_dim, stream](d_temperature, d_ghost, block_width, @@ -297,8 +297,8 @@ def inbokePackingKernel(d_temperature, d_ghost, dir, block_width, block_height, block_depth ) elif dir == RIGHT: - grid_dim = ((block_height+(block_dim.x-1))/block_dim.x, - (block_depth+(block_dim.y-1))/block_dim.y, 1) + grid_dim = ((block_height+(block_dim.x-1))//block_dim.x, + (block_depth+(block_dim.y-1))//block_dim.y, 1) rightPackingKernel[grid_dim, block_dim, stream](d_temperature, d_ghost, block_width, @@ -306,8 +306,8 @@ def 
inbokePackingKernel(d_temperature, d_ghost, dir, block_width, block_height, block_depth ) elif dir == TOP: - grid_dim = ((block_width+(block_dim.x-1))/block_dim.x, - (block_depth+(block_dim.y-1))/block_dim.y, 1) + grid_dim = ((block_width+(block_dim.x-1))//block_dim.x, + (block_depth+(block_dim.y-1))//block_dim.y, 1) topPackingKernel[grid_dim, block_dim, stream](d_temperature, d_ghost, block_width, @@ -315,8 +315,8 @@ def inbokePackingKernel(d_temperature, d_ghost, dir, block_width, block_height, block_depth ) elif dir == BOTTOM: - grid_dim = ((block_width+(block_dim.x-1))/block_dim.x, - (block_depth+(block_dim.y-1))/block_dim.y, 1) + grid_dim = ((block_width+(block_dim.x-1))//block_dim.x, + (block_depth+(block_dim.y-1))//block_dim.y, 1) bottomPackingKernel[grid_dim, block_dim, stream](d_temperature, d_ghost, block_width, @@ -324,8 +324,8 @@ def inbokePackingKernel(d_temperature, d_ghost, dir, block_width, block_height, block_depth ) elif dir == FRONT: - grid_dim = ((block_width+(block_dim.x-1))/block_dim.x, - (block_height+(block_dim.y-1))/block_dim.y, 1) + grid_dim = ((block_width+(block_dim.x-1))//block_dim.x, + (block_height+(block_dim.y-1))//block_dim.y, 1) frontPackingKernel[grid_dim, block_dim, stream](d_temperature, d_ghost, block_width, @@ -333,8 +333,8 @@ def inbokePackingKernel(d_temperature, d_ghost, dir, block_width, block_height, block_depth ) elif dir == BACK: - grid_dim = ((block_width+(block_dim.x-1))/block_dim.x, - (block_height+(block_dim.y-1))/block_dim.y, 1) + grid_dim = ((block_width+(block_dim.x-1))//block_dim.x, + (block_height+(block_dim.y-1))//block_dim.y, 1) backPackingKernel[grid_dim, block_dim, stream](d_temperature, d_ghost, block_width, @@ -346,8 +346,8 @@ def invokeUnpackingKernel(d_temperature, d_ghost, dir, block_width, block_height block_dim = (TILE_SIZE_2D, TILE_SIZE_2D, 1) if dir == LEFT: - grid_dim = ((block_height+(block_dim.x-1))/block_dim.x, - (block_depth+(block_dim.y-1))/block_dim.y, 1) + grid_dim = ((block_height+(block_dim.x-1))//block_dim.x, + (block_depth+(block_dim.y-1))//block_dim.y, 1) leftUnpackingKernel[grid_dim, block_dim, stream](d_temperature, d_ghost, block_width, @@ -355,8 +355,8 @@ def invokeUnpackingKernel(d_temperature, d_ghost, dir, block_width, block_height block_depth ) if dir == RIGHT: - grid_dim = ((block_height+(block_dim.x-1))/block_dim.x, - (block_depth+(block_dim.y-1))/block_dim.y, 1) + grid_dim = ((block_height+(block_dim.x-1))//block_dim.x, + (block_depth+(block_dim.y-1))//block_dim.y, 1) rightUnpackingKernel[grid_dim, block_dim, stream](d_temperature, d_ghost, block_width, @@ -364,8 +364,8 @@ def invokeUnpackingKernel(d_temperature, d_ghost, dir, block_width, block_height block_depth ) if dir == TOP: - grid_dim = ((block_width+(block_dim.x-1))/block_dim.x, - (block_depth+(block_dim.y-1))/block_dim.y, 1) + grid_dim = ((block_width+(block_dim.x-1))//block_dim.x, + (block_depth+(block_dim.y-1))//block_dim.y, 1) topUnpackingKernel[grid_dim, block_dim, stream](d_temperature, d_ghost, block_width, @@ -373,8 +373,8 @@ def invokeUnpackingKernel(d_temperature, d_ghost, dir, block_width, block_height block_depth ) if dir == BOTTOM: - grid_dim = ((block_width+(block_dim.x-1))/block_dim.x, - (block_depth+(block_dim.y-1))/block_dim.y, 1) + grid_dim = ((block_width+(block_dim.x-1))//block_dim.x, + (block_depth+(block_dim.y-1))//block_dim.y, 1) bottomUnpackingKernel[grid_dim, block_dim, stream](d_temperature, d_ghost, block_width, @@ -382,8 +382,8 @@ def invokeUnpackingKernel(d_temperature, d_ghost, dir, block_width, block_height 
block_depth ) if dir == FRONT: - grid_dim = ((block_width+(block_dim.x-1))/block_dim.x, - (block_height+(block_dim.y-1))/block_dim.y, 1) + grid_dim = ((block_width+(block_dim.x-1))//block_dim.x, + (block_height+(block_dim.y-1))//block_dim.y, 1) frontUnpackingKernel[grid_dim, block_dim, stream](d_temperature, d_ghost, block_width, @@ -391,8 +391,8 @@ def invokeUnpackingKernel(d_temperature, d_ghost, dir, block_width, block_height block_depth ) if dir == BACK: - grid_dim = ((block_width+(block_dim.x-1))/block_dim.x, - (block_height+(block_dim.y-1))/block_dim.y, 1) + grid_dim = ((block_width+(block_dim.x-1))//block_dim.x, + (block_height+(block_dim.y-1))//block_dim.y, 1) backUnpackingKernel[grid_dim, block_dim, stream](d_temperature, d_ghost, block_width, From 0d614be6030eeb75354f080bbb429f694b58648d Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 27 Jan 2021 10:53:48 -0500 Subject: [PATCH 056/107] add program initialization --- examples/cuda/gpudirect/jacobi3d/jacobi3d.py | 72 +++++++++++++++++++- 1 file changed, 70 insertions(+), 2 deletions(-) diff --git a/examples/cuda/gpudirect/jacobi3d/jacobi3d.py b/examples/cuda/gpudirect/jacobi3d/jacobi3d.py index 77d62c88..1c9ec39a 100644 --- a/examples/cuda/gpudirect/jacobi3d/jacobi3d.py +++ b/examples/cuda/gpudirect/jacobi3d/jacobi3d.py @@ -1,7 +1,9 @@ from charm4py import * from numba import cuda +import numpy as np from argparse import ArgumentParser from enum import Enum +from functools import reduce import time class Defaults(Enum): @@ -16,7 +18,7 @@ class Defaults(Enum): def main(args): Defaults.NUM_CHARES = charm.numPes() - argp = ArgumentParser(description = "Jacobi3D implementation in Charm4Py using " + argp = ArgumentParser(description ="Jacobi3D implementation in Charm4Py using " "CUDA and GPU-Direct communication" ) argp.add_argument('-x', '--grid_width', help = "Grid width", @@ -47,7 +49,73 @@ def main(args): ) args = argp.parse_args() - # charm.exit() + num_chares_per_dim = calc_num_chares_per_dim(num_chares, + grid_width, + grid_height, + grid_depth + ) + n_chares_x, n_chares_y, n_chares_z = num_chares_per_dim + + if reduce(lambda x, y: x*y, n_chares_per_dim) != num_chares: + print(f"ERROR: Bad grid of chares: {n_chares_x} x {n_chares_y} x " + f"{n_chares_z} != {num_chares}" + ) + charm.exit(-1) + + # Calculate block size + block_width = grid_width // n_chares_x + block_height = grid_height // n_chares_y + block_depth = grid_depth // n_chares_z + + # Calculate surf count, sizes + x_surf_count = block_height * block_depth + y_surf_count = block_width * block_depth + z_surf_count = block_width * block_height + x_surf_size = x_surf_count * np.dtype(np.float64).itemsize + y_surf_size = y_surf_count * np.dtype(np.float64).itemsize + z_surf_size = z_surf_count * np.dtype(np.float64).itemsize + + + # print configuration + print("\n[CUDA 3D Jacobi example]n") + print(f"Grid: {grid_width} x {grid_height} x {grid_depth}, " + f"Block: {block_width} x {block_height} x {block_depth}, " + f"Chares: {n_chares_x} x {n_chares_y} x {n_chares_z}, " + f"Iterations: {n_iters}, Warm-up: {warmup_iters}, " + f"Zerocopy: {use_zerocopy}, Print: {print_elements}\n\n", + ) + + +def calc_num_chares_per_dim(num_chares_total, grid_w, grid_h, grid_d): + n_chares = [0, 0, 0] + area = [0.0, 0.0, 0.0] + area[0] = grid_w * grid_h + area[1] = grid_w * grid_d + area[2] = grid_h * grid_d + + bestsurf = 2.0 * sum(area) + + ipx = 1 + + while ipx <= num_chares: + if not num_chares % ipx: + nremain = num_chares // ipx + ipy = 1 + + while ipy <= nremain: + if not nremain % 
ipy: + ipz = nremain // ipy + surf = area[0] / ipx / ipy + area[1] / ipz + area[2] / ipy / ipz + + if surf < bestsuf: + bestsurf = surf + n_chares[0] = ipx + n_chares[1] = ipy + n_chares[2] = ipz + ipy += 1 + ipx += 1 + + return n_chares # charm.start(main) if __name__ == '__main__': From cf97f37c2515872f56d9c752ecbc6a9871bb748a Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 27 Jan 2021 11:22:51 -0500 Subject: [PATCH 057/107] globals are now broadcast to chares --- examples/cuda/gpudirect/jacobi3d/jacobi3d.py | 33 +++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/examples/cuda/gpudirect/jacobi3d/jacobi3d.py b/examples/cuda/gpudirect/jacobi3d/jacobi3d.py index 1c9ec39a..2dc689fe 100644 --- a/examples/cuda/gpudirect/jacobi3d/jacobi3d.py +++ b/examples/cuda/gpudirect/jacobi3d/jacobi3d.py @@ -85,6 +85,37 @@ def main(args): f"Zerocopy: {use_zerocopy}, Print: {print_elements}\n\n", ) + charm.thisProxy.updateGlobals({'num_chares': num_chares, + 'grid_width': grid_width, + 'grid_height': grid_height, + 'grid_depth': grid_depth, + 'block_width': block_width, + 'block_height': block_height, + 'block_depth': block_depth, + 'x_surf_count': x_surf_count, + 'y_surf_count': y_surf_count, + 'z_surf_count': z_surf_count, + 'x_surf_size': x_surf_size, + 'y_surf_size': y_surf_size, + 'z_surf_size': z_surf_size, + 'n_chares_x': n_chares_x, + 'n_chares_y': n_chares_y, + 'n_chares_z': n_chares_z, + 'n_iters': n_iters, + 'warmup_iters': warmup_iters, + 'use_zerocopy': use_zerocopy, + 'print_elements': print_elements + }, awaitable = True, module_name = 'block' + ).get() + + init_done_future = Future() + block_proxy = Array(Block, + dims=[n_chares_x, n_chares_y, n_chares_z], + args = init_done_future + ) + init_done_future.get() + charm.exit() + def calc_num_chares_per_dim(num_chares_total, grid_w, grid_h, grid_d): n_chares = [0, 0, 0] @@ -117,6 +148,6 @@ def calc_num_chares_per_dim(num_chares_total, grid_w, grid_h, grid_d): return n_chares -# charm.start(main) +# charm.start(main, modules = ['block']) if __name__ == '__main__': main(None) From 9cdd749775fe6522fe6e5454df0908282dac93cb Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 27 Jan 2021 11:42:17 -0500 Subject: [PATCH 058/107] fix Block import, turn args into vars --- examples/cuda/gpudirect/jacobi3d/jacobi3d.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/examples/cuda/gpudirect/jacobi3d/jacobi3d.py b/examples/cuda/gpudirect/jacobi3d/jacobi3d.py index 2dc689fe..b0489b7b 100644 --- a/examples/cuda/gpudirect/jacobi3d/jacobi3d.py +++ b/examples/cuda/gpudirect/jacobi3d/jacobi3d.py @@ -4,6 +4,7 @@ from argparse import ArgumentParser from enum import Enum from functools import reduce +from block import Block import time class Defaults(Enum): @@ -47,7 +48,17 @@ def main(args): action = "store_true", default = Defaults.PRINT_ELEMENTS.value ) - args = argp.parse_args() + args, _ = argp.parse_known_args() + + grid_width = args.grid_width + grid_height = args.grid_height + grid_depth = args.grid_depth + num_chares = args.num_chares + iterations = args.iterations + warmup_iterations = args.warmup_iterations + use_zerocopy = args.use_zerocopy + print_blocks = args.print_blocks + num_chares_per_dim = calc_num_chares_per_dim(num_chares, grid_width, @@ -148,6 +159,6 @@ def calc_num_chares_per_dim(num_chares_total, grid_w, grid_h, grid_d): return n_chares -# charm.start(main, modules = ['block']) -if __name__ == '__main__': - main(None) +charm.start(main, modules = ['block']) +# if __name__ 
== '__main__': + # main(None) From 123d54fb268542a41d63a11e717e12b30fa14a53 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 27 Jan 2021 11:43:50 -0500 Subject: [PATCH 059/107] enum members no longer tuple --- examples/cuda/gpudirect/jacobi3d/jacobi3d.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/examples/cuda/gpudirect/jacobi3d/jacobi3d.py b/examples/cuda/gpudirect/jacobi3d/jacobi3d.py index b0489b7b..d2fca790 100644 --- a/examples/cuda/gpudirect/jacobi3d/jacobi3d.py +++ b/examples/cuda/gpudirect/jacobi3d/jacobi3d.py @@ -8,11 +8,11 @@ import time class Defaults(Enum): - GRID_WIDTH = 512, - GRID_HEIGHT = 512, - GRID_DEPTH = 512, - NUM_ITERS = 512, - WARMUP_ITERS = 10, + GRID_WIDTH = 512 + GRID_HEIGHT = 512 + GRID_DEPTH = 512 + NUM_ITERS = 512 + WARMUP_ITERS = 10 USE_ZEROCOPY = False PRINT_ELEMENTS = False @@ -61,10 +61,10 @@ def main(args): num_chares_per_dim = calc_num_chares_per_dim(num_chares, - grid_width, - grid_height, - grid_depth - ) + grid_width, + grid_height, + grid_depth + ) n_chares_x, n_chares_y, n_chares_z = num_chares_per_dim if reduce(lambda x, y: x*y, n_chares_per_dim) != num_chares: @@ -131,6 +131,7 @@ def main(args): def calc_num_chares_per_dim(num_chares_total, grid_w, grid_h, grid_d): n_chares = [0, 0, 0] area = [0.0, 0.0, 0.0] + print(grid_w, grid_h, grid_d) area[0] = grid_w * grid_h area[1] = grid_w * grid_d area[2] = grid_h * grid_d From 731c96a96b540dfbb987ce453a84fc8a5513adac Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 27 Jan 2021 11:44:10 -0500 Subject: [PATCH 060/107] enum members no longer tuple --- examples/cuda/gpudirect/jacobi3d/jacobi3d.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cuda/gpudirect/jacobi3d/jacobi3d.py b/examples/cuda/gpudirect/jacobi3d/jacobi3d.py index d2fca790..11a0d9c7 100644 --- a/examples/cuda/gpudirect/jacobi3d/jacobi3d.py +++ b/examples/cuda/gpudirect/jacobi3d/jacobi3d.py @@ -128,7 +128,7 @@ def main(args): charm.exit() -def calc_num_chares_per_dim(num_chares_total, grid_w, grid_h, grid_d): +def calc_num_chares_per_dim(num_chares, grid_w, grid_h, grid_d): n_chares = [0, 0, 0] area = [0.0, 0.0, 0.0] print(grid_w, grid_h, grid_d) From cc079019a5219895dbb354006a8c93fa8653ead0 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 27 Jan 2021 11:47:32 -0500 Subject: [PATCH 061/107] fix some names, print statements --- examples/cuda/gpudirect/jacobi3d/jacobi3d.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/examples/cuda/gpudirect/jacobi3d/jacobi3d.py b/examples/cuda/gpudirect/jacobi3d/jacobi3d.py index 11a0d9c7..2efa842e 100644 --- a/examples/cuda/gpudirect/jacobi3d/jacobi3d.py +++ b/examples/cuda/gpudirect/jacobi3d/jacobi3d.py @@ -54,10 +54,10 @@ def main(args): grid_height = args.grid_height grid_depth = args.grid_depth num_chares = args.num_chares - iterations = args.iterations - warmup_iterations = args.warmup_iterations + n_iters = args.iterations + warmup_iters = args.warmup_iterations use_zerocopy = args.use_zerocopy - print_blocks = args.print_blocks + print_elements = args.print_blocks num_chares_per_dim = calc_num_chares_per_dim(num_chares, @@ -67,7 +67,7 @@ def main(args): ) n_chares_x, n_chares_y, n_chares_z = num_chares_per_dim - if reduce(lambda x, y: x*y, n_chares_per_dim) != num_chares: + if reduce(lambda x, y: x*y, num_chares_per_dim) != num_chares: print(f"ERROR: Bad grid of chares: {n_chares_x} x {n_chares_y} x " f"{n_chares_z} != {num_chares}" ) @@ -88,7 +88,7 @@ def main(args): # print 
configuration - print("\n[CUDA 3D Jacobi example]n") + print("\n[CUDA 3D Jacobi example]\n") print(f"Grid: {grid_width} x {grid_height} x {grid_depth}, " f"Block: {block_width} x {block_height} x {block_depth}, " f"Chares: {n_chares_x} x {n_chares_y} x {n_chares_z}, " @@ -122,7 +122,7 @@ def main(args): init_done_future = Future() block_proxy = Array(Block, dims=[n_chares_x, n_chares_y, n_chares_z], - args = init_done_future + args = [init_done_future] ) init_done_future.get() charm.exit() @@ -131,7 +131,6 @@ def main(args): def calc_num_chares_per_dim(num_chares, grid_w, grid_h, grid_d): n_chares = [0, 0, 0] area = [0.0, 0.0, 0.0] - print(grid_w, grid_h, grid_d) area[0] = grid_w * grid_h area[1] = grid_w * grid_d area[2] = grid_h * grid_d @@ -150,7 +149,7 @@ def calc_num_chares_per_dim(num_chares, grid_w, grid_h, grid_d): ipz = nremain // ipy surf = area[0] / ipx / ipy + area[1] / ipz + area[2] / ipy / ipz - if surf < bestsuf: + if surf < bestsurf: bestsurf = surf n_chares[0] = ipx n_chares[1] = ipy From 29baab2228febe9c1cc3db98bdeb8f71005bd96f Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 27 Jan 2021 12:37:01 -0500 Subject: [PATCH 062/107] add default types for arguments --- examples/cuda/gpudirect/jacobi3d/jacobi3d.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/examples/cuda/gpudirect/jacobi3d/jacobi3d.py b/examples/cuda/gpudirect/jacobi3d/jacobi3d.py index 2efa842e..6e4c4306 100644 --- a/examples/cuda/gpudirect/jacobi3d/jacobi3d.py +++ b/examples/cuda/gpudirect/jacobi3d/jacobi3d.py @@ -23,21 +23,27 @@ def main(args): "CUDA and GPU-Direct communication" ) argp.add_argument('-x', '--grid_width', help = "Grid width", + type = int, default = Defaults.GRID_WIDTH.value ) argp.add_argument('-y', '--grid_height', help = "Grid height", + type = int, default = Defaults.GRID_HEIGHT.value ) argp.add_argument('-z', '--grid_depth', help = "Grid depth", + type = int, default = Defaults.GRID_DEPTH.value ) argp.add_argument('-c', '--num_chares', help = "Number of chares", + type = int, default = Defaults.NUM_CHARES ) argp.add_argument('-i', '--iterations', help = "Number of iterations", + type = int, default = Defaults.NUM_ITERS.value ) argp.add_argument('-w', '--warmup_iterations', help = "Number of warmup iterations", + type = int, default = Defaults.WARMUP_ITERS.value ) argp.add_argument('-d', '--use_zerocopy', action = "store_true", @@ -48,6 +54,8 @@ def main(args): action = "store_true", default = Defaults.PRINT_ELEMENTS.value ) + + # only parse the known args because argparse sees the Charm++ args for some reason args, _ = argp.parse_known_args() grid_width = args.grid_width @@ -160,5 +168,3 @@ def calc_num_chares_per_dim(num_chares, grid_w, grid_h, grid_d): return n_chares charm.start(main, modules = ['block']) -# if __name__ == '__main__': - # main(None) From fbd08ccdafd6f64345f255b3a86797bf7d2ec57d Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 27 Jan 2021 12:37:14 -0500 Subject: [PATCH 063/107] fix python syntax errors --- examples/cuda/gpudirect/jacobi3d/kernels.py | 38 +++++++++------------ 1 file changed, 17 insertions(+), 21 deletions(-) diff --git a/examples/cuda/gpudirect/jacobi3d/kernels.py b/examples/cuda/gpudirect/jacobi3d/kernels.py index bf4d159c..36d64581 100644 --- a/examples/cuda/gpudirect/jacobi3d/kernels.py +++ b/examples/cuda/gpudirect/jacobi3d/kernels.py @@ -82,15 +82,15 @@ def jacobiKernel(temp, new_temp, block_width, block_height, block_depth): j = (blockDim.y*blockIdx.y+threadIdx.y)+1 k = (blockDim.z*blockIdx.z+threadIdx.z)+1 - 
if (i <= block_width && j <= block_height && k <= block_depth): - new_temperature[IDX(i,j,k, block_width, block_height)] = - (temperature[IDX(i,j,k, block_width, block_height)] + - temperature[IDX(i-1,j,k, block_width, block_height)] + - temperature[IDX(i+1,j,k, block_width, block_height)] + - temperature[IDX(i,j-1,k, block_width, block_height)] + - temperature[IDX(i,j+1,k, block_width, block_height)] + - temperature[IDX(i,j,k-1, block_width, block_height)] + - temperature[IDX(i,j,k+1, block_width, block_height)]) * + if (i <= block_width and j <= block_height and k <= block_depth): + new_temperature[IDX(i,j,k, block_width, block_height)] = \ + (temperature[IDX(i,j,k, block_width, block_height)] + \ + temperature[IDX(i-1,j,k, block_width, block_height)] + \ + temperature[IDX(i+1,j,k, block_width, block_height)] + \ + temperature[IDX(i,j-1,k, block_width, block_height)] + \ + temperature[IDX(i,j+1,k, block_width, block_height)] + \ + temperature[IDX(i,j,k-1, block_width, block_height)] + \ + temperature[IDX(i,j,k+1, block_width, block_height)]) * \ 0.142857 # equivalent to dividing by 7 @cuda.jit @@ -98,7 +98,7 @@ def leftPackingKernel(temperature, ghost, block_width, block_height, block_depth j = blockDim.x*blockIdx.x+threadIdx.x; k = blockDim.y*blockIdx.y+threadIdx.y; if j < block_height and k < block_depth: - ghost[block_height*k+j] = + ghost[block_height*k+j] = \ temperature[IDX(1,1+j,1+k, block_width, block_height)] @cuda.jit @@ -106,17 +106,15 @@ def rightPackingKernel(temperature, ghost, block_width, block_height, block_dept j = blockDim.x*blockIdx.x+threadIdx.x k = blockDim.y*blockIdx.y+threadIdx.y if j < block_height and k < block_depth: - ghost[block_height*k+j] = + ghost[block_height*k+j] = \ temperature[IDX(1,1+j,1+k, block_width, block_height)] - } - @cuda.jit def topPackingKernel(temperature, ghost, block_width, block_height, block_depth): i = blockDim.x*blockIdx.x+threadIdx.x k = blockDim.y*blockIdx.y+threadIdx.y if i < block_width and k < block_depth: - ghost[block_width*k+i] = + ghost[block_width*k+i] = \ temperature[IDX(1+i,1,1+k, block_width, block_height)] @cuda.jit @@ -124,16 +122,15 @@ def bottomPackingKernel(temperature, ghost, block_width, block_height, block_dep i = blockDim.x*blockIdx.x+threadIdx.x k = blockDim.y*blockIdx.y+threadIdx.y if i < block_width and k < block_depth: - ghost[block_width*k+i] = + ghost[block_width*k+i] = \ temperature[IDX(1+i,block_height,1+k, block_width, block_height)]; - } @cuda.jit def frontPackingKernel(temperature, ghost, block_width, block_height, block_depth): i = blockDim.x*blockIdx.x+threadIdx.x j = blockDim.y*blockIdx.y+threadIdx.y if i < block_width and j < block_height: - temperature[IDX(1+i,1+j,0, block_width, block_height)] = + temperature[IDX(1+i,1+j,0, block_width, block_height)] = \ ghost[block_width*j+i] @cuda.jit @@ -141,8 +138,8 @@ def backPackingKernel(temperature, ghost, block_width, block_height, block_depth i = blockDim.x*blockIdx.x+threadIdx.x j = blockDim.y*blockIdx.y+threadIdx.y if i < block_width and j < block_height: - temperature[IDX(1+i,1+j,block_depth+1, block_width, block_height)] = - ghost[block_width*j+i] + temperature[IDX(1+i,1+j,block_depth+1, block_width, block_height)] = \ + ghost[block_width*j+i] @cuda.jit @@ -205,11 +202,10 @@ def invokeGhostInitKernels(ghosts, ghost_counts, stream): # be transferred automatically # https://docs.nvidia.com/cuda/cuda-c-programming-guide/#dim3 block_dim = (256, 1, 1) - dim3 block_dim(256); for i in range(len(ghosts)): ghost = ghosts[i] ghost_count = ghost_counts[i] - 
grid_dim = (ghost_count+block_dim[0]-1)//block_dim[0], 1, 1) + grid_dim = ((ghost_count+block_dim[0]-1)//block_dim[0], 1, 1) ghostInitKernel[grid_dim, block_dim, stream](ghosts, ghost_count) From b5d7bff914b5d52563c5323cc9bcc349718bac2f Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 27 Jan 2021 12:37:33 -0500 Subject: [PATCH 064/107] include chare file --- examples/cuda/gpudirect/jacobi3d/block.py | 54 +++++++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 examples/cuda/gpudirect/jacobi3d/block.py diff --git a/examples/cuda/gpudirect/jacobi3d/block.py b/examples/cuda/gpudirect/jacobi3d/block.py new file mode 100644 index 00000000..24839058 --- /dev/null +++ b/examples/cuda/gpudirect/jacobi3d/block.py @@ -0,0 +1,54 @@ +from charm4py import * +import kernels + +class Block(Chare): + def __init__(self, init_done_future): + self.my_iter = 0 + self.neighbors = 0 + self.remote_count = 0 + self.x = self.thisIndex[0] + self.y = self.thisIndex[1] + self.z = self.thisIndex[2] + + self.bounds = [False] * kernels.DIR_COUNT + self.init_bounds(self.x, self.y, self.z) + + self.h_ghosts = [] + self.d_ghosts = [] + self.d_send_ghosts = [] + self.d_recv_ghosts = [] + self.d_ghosts_addr = [] + self.d_send_ghosts_addr = [] + self.d_recv_ghosts_addr = [] + + self.reduce(init_done_future) + + def init_bounds(self, x, y, z): + neighbors = 0 + + if x == 0: + self.bounds[kernels.LEFT] = True + else: + neighbors += 1 + if x == n_chares_x - 1: + self.bounds[kernels.RIGHT] = True + else: + neighbors += 1 + if y == 0: + self.bounds[kernels.TOP] = True + else: + neighbors += 1 + if y == n_chares_y - 1: + self.bounds[kernels.BOTTOM] = True + else: + neighbors += 1 + if z == 0: + self.bounds[kernels.FRONT] = True + else: + neighbors += 1 + if z == n_chares_z - 1: + self.bounds[kernels.BACK] = True + else: + neighbors += 1 + + self.neighbors = neighbors From e4338c824245d074f2e7660cd5c0ad1d913be194 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 27 Jan 2021 14:26:54 -0500 Subject: [PATCH 065/107] fix spelling, corrected tuple access --- examples/cuda/gpudirect/jacobi3d/kernels.py | 181 ++++++++++---------- 1 file changed, 90 insertions(+), 91 deletions(-) diff --git a/examples/cuda/gpudirect/jacobi3d/kernels.py b/examples/cuda/gpudirect/jacobi3d/kernels.py index 36d64581..823e4d44 100644 --- a/examples/cuda/gpudirect/jacobi3d/kernels.py +++ b/examples/cuda/gpudirect/jacobi3d/kernels.py @@ -1,5 +1,4 @@ from numba import cuda -from numba.cuda import blockDim, blockIdx, threadIdx TILE_SIZE_3D = 8 TILE_SIZE_2D = 16 @@ -18,69 +17,69 @@ def IDX(i,j,k, block_width, block_height): @cuda.jit def initKernel(temperature, block_width, block_height, block_depth): - i = blockDim.x * blockIdx.x + threadIdx.x - j = blockDim.y * blockIdx.y + threadIdx.y - k = blockDim.z * blockIdx.z + threadIdx.z + i = cuda.blockDim.x * cuda.blockIdx.x + cuda.threadIdx.x + j = cuda.blockDim.y * cuda.blockIdx.y + cuda.threadIdx.y + k = cuda.blockDim.z * cuda.blockIdx.z + cuda.threadIdx.z if i < block_width + 2 and j < block_height + 2 and k < block_depth + 2: temperature[IDX(i, j, k, block_width, block_height)] = 0 @cuda.jit def ghostInitKernel(ghost, ghost_count): - i = blockDim.x * blockIdx.x + threadIdx.x + i = cuda.blockDim.x * cuda.blockIdx.x + cuda.threadIdx.x if i < ghost_count: ghost[i] = 0 @cuda.jit def leftBoundaryKernel(temperature, block_width, block_height, block_depth): - j = blockDim.x * blockIdx.x + threadIdx.x - k = blockDim.y * blockIdx.y + threadIdx.y + j = cuda.blockDim.x * cuda.blockIdx.x + 
cuda.threadIdx.x + k = cuda.blockDim.y * cuda.blockIdx.y + cuda.threadIdx.y if j < block_height and k < block_depth: temperature[IDX(0,1+j,1+k, block_width, block_height)] = 1; @cuda.jit def rightBoundaryKernel(temperature, block_width, block_height, block_depth): - j = blockDim.x*blockIdx.x+threadIdx.x - k = blockDim.y*blockIdx.y+threadIdx.y + j = cuda.blockDim.x*cuda.blockIdx.x+cuda.threadIdx.x + k = cuda.blockDim.y*cuda.blockIdx.y+cuda.threadIdx.y if j < block_height and k < block_depth: temperature[IDX(block_width+1,1+j,1+k, block_width, block_height)] = 1; @cuda.jit def topBoundaryKernel(temperature, block_width, block_height, block_depth): - i = blockDim.x*blockIdx.x+threadIdx.x - k = blockDim.y*blockIdx.y+threadIdx.y + i = cuda.blockDim.x*cuda.blockIdx.x+cuda.threadIdx.x + k = cuda.blockDim.y*cuda.blockIdx.y+cuda.threadIdx.y if i < block_width and k < block_depth: temperature[IDX(1+i,0,1+k, block_width, block_height)] = 1 @cuda.jit def bottomBoundaryKernel(temperature, block_width, block_height, block_depth): - i = blockDim.x*blockIdx.x+threadIdx.x - k = blockDim.y*blockIdx.y+threadIdx.y + i = cuda.blockDim.x*cuda.blockIdx.x+cuda.threadIdx.x + k = cuda.blockDim.y*cuda.blockIdx.y+cuda.threadIdx.y if i < block_width and k < block_depth: temperature[IDX(1+i,block_height+1,1+k, block_width, block_height)] = 1 @cuda.jit def frontBoundaryKernel(temperature, block_width, block_height, block_depth): - i = blockDim.x*blockIdx.x+threadIdx.x - j = blockDim.y*blockIdx.y+threadIdx.y + i = cuda.blockDim.x*cuda.blockIdx.x+cuda.threadIdx.x + j = cuda.blockDim.y*cuda.blockIdx.y+cuda.threadIdx.y if i < block_width and j < block_height: temperature[IDX(1+i,1+j,0, block_width, block_height)] = 1; @cuda.jit def backBoundaryKernel(temperature, block_width, block_height, block_depth): - i = blockDim.x*blockIdx.x+threadIdx.x - j = blockDim.y*blockIdx.y+threadIdx.y + i = cuda.blockDim.x*cuda.blockIdx.x+cuda.threadIdx.x + j = cuda.blockDim.y*cuda.blockIdx.y+cuda.threadIdx.y if i < block_width and j < block_height: temperature[IDX(1+i,1+j,block_depth+1, block_width, block_height)] = 1 @cuda.jit def jacobiKernel(temp, new_temp, block_width, block_height, block_depth): - i = (blockDim.x*blockIdx.x+threadIdx.x)+1 - j = (blockDim.y*blockIdx.y+threadIdx.y)+1 - k = (blockDim.z*blockIdx.z+threadIdx.z)+1 + i = (cuda.blockDim.x*cuda.blockIdx.x+cuda.threadIdx.x)+1 + j = (cuda.blockDim.y*cuda.blockIdx.y+cuda.threadIdx.y)+1 + k = (cuda.blockDim.z*cuda.blockIdx.z+cuda.threadIdx.z)+1 if (i <= block_width and j <= block_height and k <= block_depth): new_temperature[IDX(i,j,k, block_width, block_height)] = \ @@ -95,48 +94,48 @@ def jacobiKernel(temp, new_temp, block_width, block_height, block_depth): @cuda.jit def leftPackingKernel(temperature, ghost, block_width, block_height, block_depth): - j = blockDim.x*blockIdx.x+threadIdx.x; - k = blockDim.y*blockIdx.y+threadIdx.y; + j = cuda.blockDim.x*cuda.blockIdx.x+cuda.threadIdx.x; + k = cuda.blockDim.y*cuda.blockIdx.y+cuda.threadIdx.y; if j < block_height and k < block_depth: ghost[block_height*k+j] = \ temperature[IDX(1,1+j,1+k, block_width, block_height)] @cuda.jit def rightPackingKernel(temperature, ghost, block_width, block_height, block_depth): - j = blockDim.x*blockIdx.x+threadIdx.x - k = blockDim.y*blockIdx.y+threadIdx.y + j = cuda.blockDim.x*cuda.blockIdx.x+cuda.threadIdx.x + k = cuda.blockDim.y*cuda.blockIdx.y+cuda.threadIdx.y if j < block_height and k < block_depth: ghost[block_height*k+j] = \ temperature[IDX(1,1+j,1+k, block_width, block_height)] @cuda.jit def 
topPackingKernel(temperature, ghost, block_width, block_height, block_depth): - i = blockDim.x*blockIdx.x+threadIdx.x - k = blockDim.y*blockIdx.y+threadIdx.y + i = cuda.blockDim.x*cuda.blockIdx.x+cuda.threadIdx.x + k = cuda.blockDim.y*cuda.blockIdx.y+cuda.threadIdx.y if i < block_width and k < block_depth: ghost[block_width*k+i] = \ temperature[IDX(1+i,1,1+k, block_width, block_height)] @cuda.jit def bottomPackingKernel(temperature, ghost, block_width, block_height, block_depth): - i = blockDim.x*blockIdx.x+threadIdx.x - k = blockDim.y*blockIdx.y+threadIdx.y + i = cuda.blockDim.x*cuda.blockIdx.x+cuda.threadIdx.x + k = cuda.blockDim.y*cuda.blockIdx.y+cuda.threadIdx.y if i < block_width and k < block_depth: ghost[block_width*k+i] = \ temperature[IDX(1+i,block_height,1+k, block_width, block_height)]; @cuda.jit def frontPackingKernel(temperature, ghost, block_width, block_height, block_depth): - i = blockDim.x*blockIdx.x+threadIdx.x - j = blockDim.y*blockIdx.y+threadIdx.y + i = cuda.blockDim.x*cuda.blockIdx.x+cuda.threadIdx.x + j = cuda.blockDim.y*cuda.blockIdx.y+cuda.threadIdx.y if i < block_width and j < block_height: temperature[IDX(1+i,1+j,0, block_width, block_height)] = \ ghost[block_width*j+i] @cuda.jit def backPackingKernel(temperature, ghost, block_width, block_height, block_depth): - i = blockDim.x*blockIdx.x+threadIdx.x - j = blockDim.y*blockIdx.y+threadIdx.y + i = cuda.blockDim.x*cuda.blockIdx.x+cuda.threadIdx.x + j = cuda.blockDim.y*cuda.blockIdx.y+cuda.threadIdx.y if i < block_width and j < block_height: temperature[IDX(1+i,1+j,block_depth+1, block_width, block_height)] = \ ghost[block_width*j+i] @@ -144,8 +143,8 @@ def backPackingKernel(temperature, ghost, block_width, block_height, block_depth @cuda.jit def leftUnpackingKernel(temperature, ghost, block_width, block_height, block_depth): - j = blockDim.x*blockIdx.x+threadIdx.x - k = blockDim.y*blockIdx.y+threadIdx.y + j = cuda.blockDim.x*cuda.blockIdx.x+cuda.threadIdx.x + k = cuda.blockDim.y*cuda.blockIdx.y+cuda.threadIdx.y if j < block_height and k < block_depth: temperature[IDX(0,1+j,1+k, block_width, block_height)] = ghost[block_height*k+j] @@ -153,36 +152,36 @@ def leftUnpackingKernel(temperature, ghost, block_width, block_height, block_dep @cuda.jit def rightUnpackingKernel(temperature, ghost, block_width, block_height, block_depth): - j = blockDim.x*blockIdx.x+threadIdx.x - k = blockDim.y*blockIdx.y+threadIdx.y + j = cuda.blockDim.x*cuda.blockIdx.x+cuda.threadIdx.x + k = cuda.blockDim.y*cuda.blockIdx.y+cuda.threadIdx.y if j < block_height and k < block_depth: temperature[IDX(block_width+1,1+j,1+k, block_width, block_height)] = ghost[block_height*k+j] @cuda.jit def topUnpackingKernel(temperature, ghost, block_width, block_height, block_depth): - i = blockDim.x*blockIdx.x+threadIdx.x - k = blockDim.y*blockIdx.y+threadIdx.y + i = cuda.blockDim.x*cuda.blockIdx.x+cuda.threadIdx.x + k = cuda.blockDim.y*cuda.blockIdx.y+cuda.threadIdx.y if i < block_width and k < block_depth: temperature[IDX(1+i,0,1+k, block_width, block_height)] = ghost[block_width*k+i] @cuda.jit def bottomUnpackingKernel(temperature, ghost, block_width, block_height, block_depth): - i = blockDim.x*blockIdx.x+threadIdx.x - k = blockDim.y*blockIdx.y+threadIdx.y + i = cuda.blockDim.x*cuda.blockIdx.x+cuda.threadIdx.x + k = cuda.blockDim.y*cuda.blockIdx.y+cuda.threadIdx.y if i < block_width and k < block_depth: temperature[IDX(1+i,block_height+1,1+k, block_width, block_height)] = ghost[block_width*k+i] @cuda.jit def frontUnpackingKernel(temperature, ghost, 
block_width, block_height, block_depth): - i = blockDim.x*blockIdx.x+threadIdx.x - j = blockDim.y*blockIdx.y+threadIdx.y + i = cuda.blockDim.x*cuda.blockIdx.x+cuda.threadIdx.x + j = cuda.blockDim.y*cuda.blockIdx.y+cuda.threadIdx.y if i < block_width and j < block_height: temperature[IDX(1+i,1+j,0, block_width, block_height)] = ghost[block_width*j+i] @cuda.jit def backUnpackingKernel(temperature, ghost, block_width, block_height, block_depth): - i = blockDim.x*blockIdx.x+threadIdx.x - j = blockDim.y*blockIdx.y+threadIdx.y + i = cuda.blockDim.x*cuda.blockIdx.x+cuda.threadIdx.x + j = cuda.blockDim.y*cuda.blockIdx.y+cuda.threadIdx.y if i < block_width and j < block_height: temperature[IDX(1+i,1+j,block_depth+1, block_width, block_height)] = ghost[block_width*j+i] @@ -207,58 +206,58 @@ def invokeGhostInitKernels(ghosts, ghost_counts, stream): ghost_count = ghost_counts[i] grid_dim = ((ghost_count+block_dim[0]-1)//block_dim[0], 1, 1) - ghostInitKernel[grid_dim, block_dim, stream](ghosts, ghost_count) + ghostInitKernel[grid_dim, block_dim, stream](ghost, ghost_count) def invokeBoundaryKernels(d_temperature, block_width, block_height, block_depth, bounds, stream): block_dim = (TILE_SIZE_2D, TILE_SIZE_2D, 1) - if bounds(LEFT): - grid_dim = ((block_height+(block_dim.x-1))//block_dim.x, - (block_depth+(block_dim.y-1))//block_dim.y, 1) + if bounds[LEFT]: + grid_dim = ((block_height+(block_dim[0]-1))//block_dim[0], + (block_depth+(block_dim[1]-1))//block_dim[1], 1) leftBoundaryKernel[grid_dim, block_dim, stream](d_temperature, block_width, block_height, block_depth ) - if bounds(RIGHT): - grid_dim = ((block_height+(block_dim.x-1))//block_dim.x, - (block_depth+(block_dim.y-1))//block_dim.y, 1) + if bounds[RIGHT]: + grid_dim = ((block_height+(block_dim[0]-1))//block_dim[0], + (block_depth+(block_dim[1]-1))//block_dim[1], 1) rightBoundaryKernel[grid_dim, block_dim, stream](d_temperature, block_width, block_height, block_depth ) - if bounds(TOP): - grid_dim = ((block_width+(block_dim.x-1))//block_dim.x, - (block_depth+(block_dim.y-1))//block_dim.y, 1) + if bounds[TOP]: + grid_dim = ((block_width+(block_dim[0]-1))//block_dim[0], + (block_depth+(block_dim[1]-1))//block_dim[1], 1) topBoundaryKernel[grid_dim, block_dim, stream](d_temperature, block_width, block_height, block_depth ) - if bounds(BOTTOM): - grid_dim = ((block_width+(block_dim.x-1))//block_dim.x, - (block_depth+(block_dim.y-1))//block_dim.y, 1) + if bounds[BOTTOM]: + grid_dim = ((block_width+(block_dim[0]-1))//block_dim[0], + (block_depth+(block_dim[1]-1))//block_dim[1], 1) bottomBoundaryKernel[grid_dim, block_dim, stream](d_temperature, block_width, block_height, block_depth ) - if bounds(FRONT): - grid_dim = ((block_width+(block_dim.x-1))//block_dim.x, - (block_height+(block_dim.y-1))//block_dim.y, 1) + if bounds[FRONT]: + grid_dim = ((block_width+(block_dim[0]-1))//block_dim[0], + (block_height+(block_dim[1]-1))//block_dim[1], 1) frontBoundaryKernel[grid_dim, block_dim, stream](d_temperature, block_width, block_height, block_depth ) - if bounds(BACK): - grid_dim = ((block_width+(block_dim.x-1))//block_dim.x, - (block_height+(block_dim.y-1))//block_dim.y, 1) + if bounds[BACK]: + grid_dim = ((block_width+(block_dim[0]-1))//block_dim[0], + (block_height+(block_dim[1]-1))//block_dim[1], 1) backBoundaryKernel[grid_dim, block_dim, stream](d_temperature, block_width, block_height, @@ -268,9 +267,9 @@ def invokeBoundaryKernels(d_temperature, block_width, block_height, block_depth, def invokeJacobiKernel(d_temperature, d_new_temperature, block_width, 
block_height, block_depth, stream): block_dim = (TILE_SIZE_3D, TILE_SIZE_3D, TILE_SIZE_3D) - grid_dim = ((block_width+(block_dim.x-1))//block_dim.x, - (block_height+(block_dim.y-1))//block_dim.y, - (block_depth+(block_dim.z-1))//block_dim.z) + grid_dim = ((block_width+(block_dim[0]-1))//block_dim[0], + (block_height+(block_dim[1]-1))//block_dim[1], + (block_depth+(block_dim[2]-1))//block_dim[2]) jacobiKernel[grid_dim, block_dim, stream](d_temperature, d_new_temperature, @@ -280,12 +279,12 @@ def invokeJacobiKernel(d_temperature, d_new_temperature, block_width, block_heig ) -def inbokePackingKernel(d_temperature, d_ghost, dir, block_width, block_height, block_depth, stream): +def invokePackingKernel(d_temperature, d_ghost, dir, block_width, block_height, block_depth, stream): block_dim = (TILE_SIZE_2D, TILE_SIZE_2D, 1) if dir == LEFT: - grid_dim = ((block_height+(block_dim.x-1))//block_dim.x, - (block_depth+(block_dim.y-1))//block_dim.y, 1) + grid_dim = ((block_height+(block_dim[0]-1))//block_dim[0], + (block_depth+(block_dim[1]-1))//block_dim[1], 1) leftPackingKernel[grid_dim, block_dim, stream](d_temperature, d_ghost, block_width, @@ -293,8 +292,8 @@ def inbokePackingKernel(d_temperature, d_ghost, dir, block_width, block_height, block_depth ) elif dir == RIGHT: - grid_dim = ((block_height+(block_dim.x-1))//block_dim.x, - (block_depth+(block_dim.y-1))//block_dim.y, 1) + grid_dim = ((block_height+(block_dim[0]-1))//block_dim[0], + (block_depth+(block_dim[1]-1))//block_dim[1], 1) rightPackingKernel[grid_dim, block_dim, stream](d_temperature, d_ghost, block_width, @@ -302,8 +301,8 @@ def inbokePackingKernel(d_temperature, d_ghost, dir, block_width, block_height, block_depth ) elif dir == TOP: - grid_dim = ((block_width+(block_dim.x-1))//block_dim.x, - (block_depth+(block_dim.y-1))//block_dim.y, 1) + grid_dim = ((block_width+(block_dim[0]-1))//block_dim[0], + (block_depth+(block_dim[1]-1))//block_dim[1], 1) topPackingKernel[grid_dim, block_dim, stream](d_temperature, d_ghost, block_width, @@ -311,8 +310,8 @@ def inbokePackingKernel(d_temperature, d_ghost, dir, block_width, block_height, block_depth ) elif dir == BOTTOM: - grid_dim = ((block_width+(block_dim.x-1))//block_dim.x, - (block_depth+(block_dim.y-1))//block_dim.y, 1) + grid_dim = ((block_width+(block_dim[0]-1))//block_dim[0], + (block_depth+(block_dim[1]-1))//block_dim[1], 1) bottomPackingKernel[grid_dim, block_dim, stream](d_temperature, d_ghost, block_width, @@ -320,8 +319,8 @@ def inbokePackingKernel(d_temperature, d_ghost, dir, block_width, block_height, block_depth ) elif dir == FRONT: - grid_dim = ((block_width+(block_dim.x-1))//block_dim.x, - (block_height+(block_dim.y-1))//block_dim.y, 1) + grid_dim = ((block_width+(block_dim[0]-1))//block_dim[0], + (block_height+(block_dim[1]-1))//block_dim[1], 1) frontPackingKernel[grid_dim, block_dim, stream](d_temperature, d_ghost, block_width, @@ -329,8 +328,8 @@ def inbokePackingKernel(d_temperature, d_ghost, dir, block_width, block_height, block_depth ) elif dir == BACK: - grid_dim = ((block_width+(block_dim.x-1))//block_dim.x, - (block_height+(block_dim.y-1))//block_dim.y, 1) + grid_dim = ((block_width+(block_dim[0]-1))//block_dim[0], + (block_height+(block_dim[1]-1))//block_dim[1], 1) backPackingKernel[grid_dim, block_dim, stream](d_temperature, d_ghost, block_width, @@ -342,8 +341,8 @@ def invokeUnpackingKernel(d_temperature, d_ghost, dir, block_width, block_height block_dim = (TILE_SIZE_2D, TILE_SIZE_2D, 1) if dir == LEFT: - grid_dim = ((block_height+(block_dim.x-1))//block_dim.x, - 
(block_depth+(block_dim.y-1))//block_dim.y, 1) + grid_dim = ((block_height+(block_dim[0]-1))//block_dim[0], + (block_depth+(block_dim[1]-1))//block_dim[1], 1) leftUnpackingKernel[grid_dim, block_dim, stream](d_temperature, d_ghost, block_width, @@ -351,8 +350,8 @@ def invokeUnpackingKernel(d_temperature, d_ghost, dir, block_width, block_height block_depth ) if dir == RIGHT: - grid_dim = ((block_height+(block_dim.x-1))//block_dim.x, - (block_depth+(block_dim.y-1))//block_dim.y, 1) + grid_dim = ((block_height+(block_dim[0]-1))//block_dim[0], + (block_depth+(block_dim[1]-1))//block_dim[1], 1) rightUnpackingKernel[grid_dim, block_dim, stream](d_temperature, d_ghost, block_width, @@ -360,8 +359,8 @@ def invokeUnpackingKernel(d_temperature, d_ghost, dir, block_width, block_height block_depth ) if dir == TOP: - grid_dim = ((block_width+(block_dim.x-1))//block_dim.x, - (block_depth+(block_dim.y-1))//block_dim.y, 1) + grid_dim = ((block_width+(block_dim[0]-1))//block_dim[0], + (block_depth+(block_dim[1]-1))//block_dim[1], 1) topUnpackingKernel[grid_dim, block_dim, stream](d_temperature, d_ghost, block_width, @@ -369,8 +368,8 @@ def invokeUnpackingKernel(d_temperature, d_ghost, dir, block_width, block_height block_depth ) if dir == BOTTOM: - grid_dim = ((block_width+(block_dim.x-1))//block_dim.x, - (block_depth+(block_dim.y-1))//block_dim.y, 1) + grid_dim = ((block_width+(block_dim[0]-1))//block_dim[0], + (block_depth+(block_dim[1]-1))//block_dim[1], 1) bottomUnpackingKernel[grid_dim, block_dim, stream](d_temperature, d_ghost, block_width, @@ -378,8 +377,8 @@ def invokeUnpackingKernel(d_temperature, d_ghost, dir, block_width, block_height block_depth ) if dir == FRONT: - grid_dim = ((block_width+(block_dim.x-1))//block_dim.x, - (block_height+(block_dim.y-1))//block_dim.y, 1) + grid_dim = ((block_width+(block_dim[0]-1))//block_dim[0], + (block_height+(block_dim[1]-1))//block_dim[1], 1) frontUnpackingKernel[grid_dim, block_dim, stream](d_temperature, d_ghost, block_width, @@ -387,8 +386,8 @@ def invokeUnpackingKernel(d_temperature, d_ghost, dir, block_width, block_height block_depth ) if dir == BACK: - grid_dim = ((block_width+(block_dim.x-1))//block_dim.x, - (block_height+(block_dim.y-1))//block_dim.y, 1) + grid_dim = ((block_width+(block_dim[0]-1))//block_dim[0], + (block_height+(block_dim[1]-1))//block_dim[1], 1) backUnpackingKernel[grid_dim, block_dim, stream](d_temperature, d_ghost, block_width, From 6e076606a8f74b0d9f2eabb71b76347e129515f7 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 27 Jan 2021 14:27:30 -0500 Subject: [PATCH 066/107] add chare initialization --- examples/cuda/gpudirect/jacobi3d/block.py | 129 ++++++++++++++++++++-- 1 file changed, 121 insertions(+), 8 deletions(-) diff --git a/examples/cuda/gpudirect/jacobi3d/block.py b/examples/cuda/gpudirect/jacobi3d/block.py index 24839058..476537ae 100644 --- a/examples/cuda/gpudirect/jacobi3d/block.py +++ b/examples/cuda/gpudirect/jacobi3d/block.py @@ -1,6 +1,18 @@ from charm4py import * +import array +from numba import cuda +import numpy as np import kernels +def getArrayAddress(arr): + return arr.__cuda__array_interface__['data'][0] + +def getArraySize(arr): + return arr.nbytes + +def getArrayData(arr): + return (getArrayAddress(arr), getArraySize(arr)) + class Block(Chare): def __init__(self, init_done_future): self.my_iter = 0 @@ -9,20 +21,121 @@ def __init__(self, init_done_future): self.x = self.thisIndex[0] self.y = self.thisIndex[1] self.z = self.thisIndex[2] + self.ghost_sizes = (x_surf_size, x_surf_size, + y_surf_size, 
y_surf_size, + z_surf_size, z_surf_size + ) + + self.ghost_counts = (x_surf_count, x_surf_count, + y_surf_count, y_surf_count, + z_surf_count, z_surf_count + ) self.bounds = [False] * kernels.DIR_COUNT - self.init_bounds(self.x, self.y, self.z) - self.h_ghosts = [] - self.d_ghosts = [] - self.d_send_ghosts = [] - self.d_recv_ghosts = [] - self.d_ghosts_addr = [] - self.d_send_ghosts_addr = [] - self.d_recv_ghosts_addr = [] + empty = lambda x: [0] * x + + self.h_temperature = None + self.d_temperature = None + self.d_new_temperature = None + self.h_ghosts = empty(kernels.DIR_COUNT) + self.d_ghosts = empty(kernels.DIR_COUNT) + self.d_send_ghosts = empty(kernels.DIR_COUNT) + self.d_recv_ghosts = empty(kernels.DIR_COUNT) + self.d_ghosts_addr = empty(kernels.DIR_COUNT) + self.d_send_ghosts_addr = empty(kernels.DIR_COUNT) + self.d_recv_ghosts_addr = empty(kernels.DIR_COUNT) + self.d_send_ghosts_size = empty(kernels.DIR_COUNT) + self.d_recv_ghotss_size = empty(kernels.DIR_COUNT) + + self.stream = cuda.default_stream() + + self.init() self.reduce(init_done_future) + def init(self): + self.init_bounds(self.x, self.y, self.z) + self.init_device_data() + + def init_device_data(self): + temp_size = (block_width+2) * (block_height+2) * (block_depth+2) + self.h_temperature = cuda.pinned_array(temp_size, dtype=np.float64) + self.d_temperature = cuda.device_array(temp_size, dtype=np.float64) + self.d_new_temperature = cuda.device_array(temp_size, dtype=np.float64) + + if use_zerocopy: + for i in range(kernels.DIR_COUNT): + self.d_send_ghosts[i] = cuda.device_array(self.ghost_sizes[i], + dtype=np.float64 + ) + self.d_recv_ghosts[i] = cuda.device_array(self.ghost_sizes[i], + dtype=np.float64 + ) + + d_send_data = getArrayData(d_send_ghosts) + d_recv_data = getArrayData(d_send_ghosts) + + d_send_addr = array.array('L', [d_send_data[0]]) + d_recv_addr = array.array('L', [d_recv_data[0]]) + + d_send_size = array.array('L', [d_send_data[1]]) + d_recv_size = array.array('L', [d_recv_data[1]]) + + self.d_send_ghosts_addr[i] = d_send_addr + self.d_recv_ghosts_addr[i] = d_recv_addr + + self.d_send_ghosts_size[i] = d_send_size + self.d_recv_ghosts_size[i] = d_recv_size + else: + for i in range(kernels.DIR_COUNT): + self.h_ghosts[i] = cuda.pinned_array(self.ghost_sizes[i], + dtype=np.float64 + ) + self.d_ghosts[i] = cuda.device_array(self.ghost_sizes[i], + dtype=np.float64 + ) + + kernels.invokeInitKernel(self.d_temperature, block_width, block_height, block_depth, + self.stream + ) + kernels.invokeInitKernel(self.d_new_temperature, block_width, block_height, block_depth, + self.stream + ) + if use_zerocopy: + kernels.invokeGhostInitKernels(self.d_send_ghosts, + self.ghost_counts, + self.stream + ) + kernels.invokeGhostInitKernels(self.d_recv_ghosts, + self.ghost_counts, + self.stream + ) + else: + kernels.invokeGhostInitKernels(self.d_ghosts, + self.ghost_counts, + self.stream + ) + for i in range(kernels.DIR_COUNT): + self.h_ghosts[i].fill(0) + + kernels.invokeBoundaryKernels(self.d_temperature, + block_width, + block_height, + block_depth, + self.bounds, + self.stream + ) + kernels.invokeBoundaryKernels(self.d_new_temperature, + block_width, + block_height, + block_depth, + self.bounds, + self.stream + ) + self.stream.synchronize() + + def init_bounds(self, x, y, z): neighbors = 0 From b2d9c6e99c70d7bf6b6700117c80d58a7efb06c9 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 27 Jan 2021 14:40:19 -0500 Subject: [PATCH 067/107] add neighbor channel initialization --- examples/cuda/gpudirect/jacobi3d/block.py | 31 
+++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/examples/cuda/gpudirect/jacobi3d/block.py b/examples/cuda/gpudirect/jacobi3d/block.py index 476537ae..86c3d359 100644 --- a/examples/cuda/gpudirect/jacobi3d/block.py +++ b/examples/cuda/gpudirect/jacobi3d/block.py @@ -21,6 +21,7 @@ def __init__(self, init_done_future): self.x = self.thisIndex[0] self.y = self.thisIndex[1] self.z = self.thisIndex[2] + self.ghost_sizes = (x_surf_size, x_surf_size, y_surf_size, y_surf_size, z_surf_size, z_surf_size @@ -35,6 +36,8 @@ def __init__(self, init_done_future): empty = lambda x: [0] * x + self.neighbor_channels = empty(kernels.DIR_COUNT) + self.h_temperature = None self.d_temperature = None self.d_new_temperature = None @@ -57,6 +60,34 @@ def __init__(self, init_done_future): def init(self): self.init_bounds(self.x, self.y, self.z) self.init_device_data() + self.init_neighbor_channels() + + def init_neighbor_channels(self): + n_channels = self.neighbors + + if not self.bounds[kernels.LEFT]: + new_c = Channel(self, self.thisProxy[(self.x-1, self.y, self.z)]) + self.neighbor_channels[kernels.LEFT] = new_c + + if not self.bounds[kernels.RIGHT]: + new_c = Channel(self, self.thisProxy[(self.x+1, self.y, self.z)]) + self.neighbor_channels[kernels.RIGHT] = new_c + + if not self.bounds[kernels.TOP]: + new_c = Channel(self, self.thisProxy[(self.x, self.y-1, self.z)]) + self.neighbor_channels[kernels.TOP] = new_c + + if not self.bounds[kernels.BOTTOM]: + new_c = Channel(self, self.thisProxy[(self.x, self.y+1, self.z)]) + self.neighbor_channels[kernels.BOTTOM] = new_c + + if not self.bounds[kernels.FRONT]: + new_c = Channel(self, self.thisProxy[(self.x, self.y, self.z-1)]) + self.neighbor_channels[kernels.FRONT] = new_c + + if not self.bounds[kernels.BACK]: + new_c = Channel(self, self.thisProxy[(self.x, self.y, self.z+1)]) + self.neighbor_channels[kernels.BACK] = new_c def init_device_data(self): temp_size = (block_width+2) * (block_height+2) * (block_depth+2) From 457a9fa249bfc7378366e2b7a710ebfeab79f0ec Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 27 Jan 2021 16:14:23 -0500 Subject: [PATCH 068/107] finish impl --- examples/cuda/gpudirect/jacobi3d/block.py | 131 +++++++++++++++++++++- 1 file changed, 127 insertions(+), 4 deletions(-) diff --git a/examples/cuda/gpudirect/jacobi3d/block.py b/examples/cuda/gpudirect/jacobi3d/block.py index 86c3d359..9a87e350 100644 --- a/examples/cuda/gpudirect/jacobi3d/block.py +++ b/examples/cuda/gpudirect/jacobi3d/block.py @@ -2,10 +2,11 @@ import array from numba import cuda import numpy as np +import time import kernels def getArrayAddress(arr): - return arr.__cuda__array_interface__['data'][0] + return arr.__cuda_array_interface__['data'][0] def getArraySize(arr): return arr.nbytes @@ -37,6 +38,7 @@ def __init__(self, init_done_future): empty = lambda x: [0] * x self.neighbor_channels = empty(kernels.DIR_COUNT) + self.acive_neighbor_channels = None self.h_temperature = None self.d_temperature = None @@ -49,7 +51,7 @@ def __init__(self, init_done_future): self.d_send_ghosts_addr = empty(kernels.DIR_COUNT) self.d_recv_ghosts_addr = empty(kernels.DIR_COUNT) self.d_send_ghosts_size = empty(kernels.DIR_COUNT) - self.d_recv_ghotss_size = empty(kernels.DIR_COUNT) + self.d_recv_ghosts_size = empty(kernels.DIR_COUNT) self.stream = cuda.default_stream() @@ -64,30 +66,46 @@ def init(self): def init_neighbor_channels(self): n_channels = self.neighbors + active_neighbors = [] if not self.bounds[kernels.LEFT]: new_c = Channel(self, self.thisProxy[(self.x-1, 
self.y, self.z)]) self.neighbor_channels[kernels.LEFT] = new_c + # NOTE: we are adding the member 'recv_direction' to this channel!!! + new_c.recv_direction = kernels.LEFT + active_neighbors.append(new_c) if not self.bounds[kernels.RIGHT]: new_c = Channel(self, self.thisProxy[(self.x+1, self.y, self.z)]) self.neighbor_channels[kernels.RIGHT] = new_c + new_c.recv_direction = kernels.RIGHT + active_neighbors.append(new_c) if not self.bounds[kernels.TOP]: new_c = Channel(self, self.thisProxy[(self.x, self.y-1, self.z)]) self.neighbor_channels[kernels.TOP] = new_c + new_c.recv_direction = kernels.TOP + active_neighbors.append(new_c) if not self.bounds[kernels.BOTTOM]: new_c = Channel(self, self.thisProxy[(self.x, self.y+1, self.z)]) self.neighbor_channels[kernels.BOTTOM] = new_c + new_c.recv_direction = kernels.BOTTOM + active_neighbors.append(new_c) if not self.bounds[kernels.FRONT]: new_c = Channel(self, self.thisProxy[(self.x, self.y, self.z-1)]) self.neighbor_channels[kernels.FRONT] = new_c + new_c.recv_direction = kernels.FRONT + active_neighbors.append(new_c) if not self.bounds[kernels.BACK]: new_c = Channel(self, self.thisProxy[(self.x, self.y, self.z+1)]) self.neighbor_channels[kernels.BACK] = new_c + new_c.recv_direction = kernels.BACK + active_neighbors.append(new_c) + + self.active_neighbor_channels = active_neighbors def init_device_data(self): temp_size = (block_width+2) * (block_height+2) * (block_depth+2) @@ -104,8 +122,8 @@ def init_device_data(self): dtype=np.float64 ) - d_send_data = getArrayData(d_send_ghosts) - d_recv_data = getArrayData(d_send_ghosts) + d_send_data = getArrayData(self.d_send_ghosts[i]) + d_recv_data = getArrayData(self.d_send_ghosts[i]) d_send_addr = array.array('L', [d_send_data[0]]) d_recv_addr = array.array('L', [d_recv_data[0]]) @@ -196,3 +214,108 @@ def init_bounds(self, x, y, z): neighbors += 1 self.neighbors = neighbors + + + @coro + def sendGhosts(self): + for dir in range(kernels.DIR_COUNT): + if not self.bounds[dir]: + self.sendGhost(dir) + + def updateAndPack(self): + kernels.invokeJacobiKernel(self.d_temperature, + self.d_new_temperature, + block_width, + block_height, + block_depth, + self.stream + ) + + for i in range(kernels.DIR_COUNT): + if not self.bounds[i]: + ghosts = self.d_send_ghosts[i] if use_zerocopy else self.d_ghosts[i] + + kernels.invokePackingKernel(self.d_temperature, + ghosts, + i, + block_width, + block_height, + block_depth, + self.stream + ) + if not use_zerocopy: + # TODO: change this to the CUDA hooks in charmlib + self.d_ghosts[i].copy_to_host(self.h_ghosts[i]) + self.stream.synchronize() + + + @coro + def sendGhost(self, direction): + send_ch = self.neighbor_channels[direction] + + if use_zerocopy: + send_ch.send(gpu_src_ptrs = self.d_send_ghosts_addr[direction], + gpu_src_sizes = self.d_send_ghosts_size[direction] + ) + else: + send_ch.send(self.h_ghosts[direction]) + + @coro + def recvGhosts(self): + for ch in charm.iwait(self.active_neighbor_channels): + # remember: we set 'recv_direction' member + # directly in the initialization phase + neighbor_idx = ch.recv_direction + + if use_zerocopy: + ch.recv(post_buf_addresses = self.d_recv_ghosts_addr[neighbor_idx], + post_buf_sizes = self.d_recv_ghosts_size[neighbor_idx] + ) + else: + self.h_ghosts[neighbor_idx] = ch.recv() + self.d_ghosts[neighbor_idx].copy_to_device(self.h_ghosts[neighbor_idx], + stream=self.stream + ) + + kernels.invokeUnpackingKernel(self.d_temperature, + self.d_ghosts[neighbor_idx], + ch.recv_direction, + block_width, + block_height, + block_depth, + 
self.stream + ) + self.stream.synchronize() + + @coro + def exchangeGhosts(self): + self.d_temperature, self.d_new_temperature = \ + self.d_new_temperature, self.d_temperature + + self.sendGhosts() + self.recvGhosts() + + @coro + def run(self, done_future): + tstart = time.time() + comm_time = 0 + for current_iter in range(n_iters + warmup_iters): + if current_iter == warmup_iters: + tstart = time.time() + + self.my_iter = current_iter + self.updateAndPack() + + comm_start_time = time.time() + + self.exchangeGhosts() + + if current_iter >= warmup_iters: + comm_time += time.time() - comm_start_time + + + tend = time.time() + + if self.thisIndex == (0, 0, 0): + print(f'Elapsed time: {tend-tstart}') + self.reduce(done_future) From a65c2dd73f0be23aa733da34a71e1f0af10c453e Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 27 Jan 2021 16:14:35 -0500 Subject: [PATCH 069/107] chares now run --- examples/cuda/gpudirect/jacobi3d/jacobi3d.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/examples/cuda/gpudirect/jacobi3d/jacobi3d.py b/examples/cuda/gpudirect/jacobi3d/jacobi3d.py index 6e4c4306..2160ed7e 100644 --- a/examples/cuda/gpudirect/jacobi3d/jacobi3d.py +++ b/examples/cuda/gpudirect/jacobi3d/jacobi3d.py @@ -133,6 +133,10 @@ def main(args): args = [init_done_future] ) init_done_future.get() + + run_future = Future() + block_proxy.run(run_future) + run_future.get() charm.exit() From e5c6aa09695da53e3c6340ce92fb9d9ca9c68486 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 27 Jan 2021 16:14:49 -0500 Subject: [PATCH 070/107] fix mispelled var --- examples/cuda/gpudirect/jacobi3d/kernels.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cuda/gpudirect/jacobi3d/kernels.py b/examples/cuda/gpudirect/jacobi3d/kernels.py index 823e4d44..ce489757 100644 --- a/examples/cuda/gpudirect/jacobi3d/kernels.py +++ b/examples/cuda/gpudirect/jacobi3d/kernels.py @@ -76,7 +76,7 @@ def backBoundaryKernel(temperature, block_width, block_height, block_depth): temperature[IDX(1+i,1+j,block_depth+1, block_width, block_height)] = 1 @cuda.jit -def jacobiKernel(temp, new_temp, block_width, block_height, block_depth): +def jacobiKernel(temperature, new_temperature, block_width, block_height, block_depth): i = (cuda.blockDim.x*cuda.blockIdx.x+cuda.threadIdx.x)+1 j = (cuda.blockDim.y*cuda.blockIdx.y+cuda.threadIdx.y)+1 k = (cuda.blockDim.z*cuda.blockIdx.z+cuda.threadIdx.z)+1 From a4dbe3c2f556b55da91d6367005e97621be84b8d Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 27 Jan 2021 20:02:14 -0500 Subject: [PATCH 071/107] correct ghost info now received, output runtime info --- examples/cuda/gpudirect/jacobi3d/block.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/examples/cuda/gpudirect/jacobi3d/block.py b/examples/cuda/gpudirect/jacobi3d/block.py index 9a87e350..0c032a8c 100644 --- a/examples/cuda/gpudirect/jacobi3d/block.py +++ b/examples/cuda/gpudirect/jacobi3d/block.py @@ -248,7 +248,6 @@ def updateAndPack(self): self.d_ghosts[i].copy_to_host(self.h_ghosts[i]) self.stream.synchronize() - @coro def sendGhost(self, direction): send_ch = self.neighbor_channels[direction] @@ -271,14 +270,16 @@ def recvGhosts(self): ch.recv(post_buf_addresses = self.d_recv_ghosts_addr[neighbor_idx], post_buf_sizes = self.d_recv_ghosts_size[neighbor_idx] ) + recv_ghost = self.d_recv_ghosts[neighbor_idx] else: self.h_ghosts[neighbor_idx] = ch.recv() self.d_ghosts[neighbor_idx].copy_to_device(self.h_ghosts[neighbor_idx], stream=self.stream ) + recv_ghost = 
self.d_ghosts[neighbor_idx] kernels.invokeUnpackingKernel(self.d_temperature, - self.d_ghosts[neighbor_idx], + recv_ghost, ch.recv_direction, block_width, block_height, @@ -317,5 +318,7 @@ def run(self, done_future): tend = time.time() if self.thisIndex == (0, 0, 0): - print(f'Elapsed time: {tend-tstart}') + elapsed_time = tend-tstart + print(f'Elapsed time: {round(elapsed_time,5)}') + print(f'Approximate time per iteration: {round(((elapsed_time/n_iters)*1e6),5)}') self.reduce(done_future) From b8eca90ed753fede2b43977db36155801d467732 Mon Sep 17 00:00:00 2001 From: Jaemin Choi Date: Wed, 27 Jan 2021 21:01:22 -0500 Subject: [PATCH 072/107] Fix wrong assertion --- charm4py/channel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charm4py/channel.py b/charm4py/channel.py index e187f19f..4584c618 100644 --- a/charm4py/channel.py +++ b/charm4py/channel.py @@ -80,7 +80,7 @@ def recv(self, *post_buffers, post_buf_addresses = None, post_buf_sizes = None, elif post_buf_addresses != None: gpu_recv_bufs = ret # ret = ret[:-1:1] - assert len(post_buffers) == len(gpu_recv_bufs) + assert len(post_buf_addresses) == len(gpu_recv_bufs) assert post_buf_sizes recv_future = charm.getGPUDirectDataFromAddresses(post_buf_addresses, post_buf_sizes, gpu_recv_bufs, stream_ptrs) recv_future.get() From 60a250cda558413baf65ae39aade5bdbf3492ae7 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 27 Jan 2021 21:36:13 -0500 Subject: [PATCH 073/107] use charmlib hooks for copying --- examples/cuda/gpudirect/jacobi3d/block.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/examples/cuda/gpudirect/jacobi3d/block.py b/examples/cuda/gpudirect/jacobi3d/block.py index 0c032a8c..c2af0b5d 100644 --- a/examples/cuda/gpudirect/jacobi3d/block.py +++ b/examples/cuda/gpudirect/jacobi3d/block.py @@ -244,8 +244,12 @@ def updateAndPack(self): self.stream ) if not use_zerocopy: - # TODO: change this to the CUDA hooks in charmlib - self.d_ghosts[i].copy_to_host(self.h_ghosts[i]) + # self.d_ghosts[i].copy_to_host(self.h_ghosts[i]) + charm.lib.CudaDtoH(self.h_ghosts[i].__array_interface__['data'][0], + self.d_ghosts[i].__cuda_array_interface__['data'][0], + self.d_ghosts[i].nbytes, 0 + ) + self.stream.synchronize() @coro @@ -273,9 +277,14 @@ def recvGhosts(self): recv_ghost = self.d_recv_ghosts[neighbor_idx] else: self.h_ghosts[neighbor_idx] = ch.recv() - self.d_ghosts[neighbor_idx].copy_to_device(self.h_ghosts[neighbor_idx], - stream=self.stream - ) + # self.d_ghosts[neighbor_idx].copy_to_device(self.h_ghosts[neighbor_idx], + # stream=self.stream + # ) + charm.lib.CudaHtoD(self.d_ghosts[neighbor_idx].__cuda_array_interface__['data'][0], + self.h_ghosts[neighbor_idx].__array_interface__['data'][0], + self.d_ghosts[neighbor_idx].nbytes, 0 + ) + recv_ghost = self.d_ghosts[neighbor_idx] kernels.invokeUnpackingKernel(self.d_temperature, From b700cba0671b31f96990d9fb0f8e20192fd22e03 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 27 Jan 2021 21:36:54 -0500 Subject: [PATCH 074/107] temporary hard-code stream 0 --- charm4py/charmlib/charmlib_cython.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/charm4py/charmlib/charmlib_cython.pyx b/charm4py/charmlib/charmlib_cython.pyx index c360c264..271af7c9 100644 --- a/charm4py/charmlib/charmlib_cython.pyx +++ b/charm4py/charmlib/charmlib_cython.pyx @@ -958,13 +958,13 @@ class CharmLib(object): future_id ) def CudaHtoD(self, long destAddr, long srcAddr, int nbytes, long streamAddr): - CkCUDAHtoD(destAddr, 
srcAddr,nbytes, ( streamAddr)[0]); + CkCUDAHtoD(destAddr, srcAddr,nbytes, 0); def CudaDtoH(self, long destAddr, long srcAddr, int nbytes, long streamAddr): - CkCUDADtoH(destAddr, srcAddr,int(nbytes), ( streamAddr)[0]); + CkCUDADtoH(destAddr, srcAddr,int(nbytes), 0); def CudaStreamSynchronize(self, long streamAddr): - CkCUDAStreamSynchronize((streamAddr)[0]) + CkCUDAStreamSynchronize(0) From 475540a84df998c0c26c6fa0997f11b2a0be2a8b Mon Sep 17 00:00:00 2001 From: Jaemin Choi Date: Wed, 27 Jan 2021 22:23:07 -0500 Subject: [PATCH 075/107] Charm4py Jacobi3D: Update timer outputs, make default iter 100 --- examples/cuda/gpudirect/jacobi3d/block.py | 5 +++-- examples/cuda/gpudirect/jacobi3d/jacobi3d.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/cuda/gpudirect/jacobi3d/block.py b/examples/cuda/gpudirect/jacobi3d/block.py index 0c032a8c..636f38ca 100644 --- a/examples/cuda/gpudirect/jacobi3d/block.py +++ b/examples/cuda/gpudirect/jacobi3d/block.py @@ -319,6 +319,7 @@ def run(self, done_future): if self.thisIndex == (0, 0, 0): elapsed_time = tend-tstart - print(f'Elapsed time: {round(elapsed_time,5)}') - print(f'Approximate time per iteration: {round(((elapsed_time/n_iters)*1e6),5)}') + print(f'Elapsed time: {round(elapsed_time,3)} s') + print(f'Average time per iteration: {round(((elapsed_time/n_iters)*1e3),3)} ms') + print(f'Communication time per iteration: {round(((comm_time/n_iters)*1e3),3)} ms') self.reduce(done_future) diff --git a/examples/cuda/gpudirect/jacobi3d/jacobi3d.py b/examples/cuda/gpudirect/jacobi3d/jacobi3d.py index 2160ed7e..c759807c 100644 --- a/examples/cuda/gpudirect/jacobi3d/jacobi3d.py +++ b/examples/cuda/gpudirect/jacobi3d/jacobi3d.py @@ -11,7 +11,7 @@ class Defaults(Enum): GRID_WIDTH = 512 GRID_HEIGHT = 512 GRID_DEPTH = 512 - NUM_ITERS = 512 + NUM_ITERS = 100 WARMUP_ITERS = 10 USE_ZEROCOPY = False PRINT_ELEMENTS = False From c0e5398cdd63136db8b88baa959f7848a8cbb283 Mon Sep 17 00:00:00 2001 From: Jaemin Choi Date: Wed, 27 Jan 2021 23:30:32 -0500 Subject: [PATCH 076/107] Charm4py Jacobi3D: Revert to old host-staging method to avoid errors --- examples/cuda/gpudirect/jacobi3d/block.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/examples/cuda/gpudirect/jacobi3d/block.py b/examples/cuda/gpudirect/jacobi3d/block.py index f16aa11d..98665981 100644 --- a/examples/cuda/gpudirect/jacobi3d/block.py +++ b/examples/cuda/gpudirect/jacobi3d/block.py @@ -244,11 +244,13 @@ def updateAndPack(self): self.stream ) if not use_zerocopy: - # self.d_ghosts[i].copy_to_host(self.h_ghosts[i]) + self.d_ghosts[i].copy_to_host(self.h_ghosts[i], self.stream) + ''' charm.lib.CudaDtoH(self.h_ghosts[i].__array_interface__['data'][0], self.d_ghosts[i].__cuda_array_interface__['data'][0], self.d_ghosts[i].nbytes, 0 ) + ''' self.stream.synchronize() @@ -277,14 +279,15 @@ def recvGhosts(self): recv_ghost = self.d_recv_ghosts[neighbor_idx] else: self.h_ghosts[neighbor_idx] = ch.recv() - # self.d_ghosts[neighbor_idx].copy_to_device(self.h_ghosts[neighbor_idx], - # stream=self.stream - # ) + self.d_ghosts[neighbor_idx].copy_to_device(self.h_ghosts[neighbor_idx], + stream=self.stream + ) + ''' charm.lib.CudaHtoD(self.d_ghosts[neighbor_idx].__cuda_array_interface__['data'][0], self.h_ghosts[neighbor_idx].__array_interface__['data'][0], self.d_ghosts[neighbor_idx].nbytes, 0 ) - + ''' recv_ghost = self.d_ghosts[neighbor_idx] kernels.invokeUnpackingKernel(self.d_temperature, From 91be5a60f3fc3e1cf98a42fdb698e092602f3ca7 Mon Sep 17 00:00:00 2001 
From: Jaemin Choi Date: Wed, 27 Jan 2021 23:48:44 -0500 Subject: [PATCH 077/107] Revert back host-staging mechanism --- examples/cuda/gpudirect/jacobi3d/block.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/examples/cuda/gpudirect/jacobi3d/block.py b/examples/cuda/gpudirect/jacobi3d/block.py index 98665981..f96275a6 100644 --- a/examples/cuda/gpudirect/jacobi3d/block.py +++ b/examples/cuda/gpudirect/jacobi3d/block.py @@ -216,12 +216,6 @@ def init_bounds(self, x, y, z): self.neighbors = neighbors - @coro - def sendGhosts(self): - for dir in range(kernels.DIR_COUNT): - if not self.bounds[dir]: - self.sendGhost(dir) - def updateAndPack(self): kernels.invokeJacobiKernel(self.d_temperature, self.d_new_temperature, @@ -244,16 +238,20 @@ def updateAndPack(self): self.stream ) if not use_zerocopy: - self.d_ghosts[i].copy_to_host(self.h_ghosts[i], self.stream) - ''' + #self.d_ghosts[i].copy_to_host(self.h_ghosts[i], self.stream) charm.lib.CudaDtoH(self.h_ghosts[i].__array_interface__['data'][0], self.d_ghosts[i].__cuda_array_interface__['data'][0], self.d_ghosts[i].nbytes, 0 ) - ''' self.stream.synchronize() + @coro + def sendGhosts(self): + for dir in range(kernels.DIR_COUNT): + if not self.bounds[dir]: + self.sendGhost(dir) + @coro def sendGhost(self, direction): send_ch = self.neighbor_channels[direction] @@ -279,6 +277,7 @@ def recvGhosts(self): recv_ghost = self.d_recv_ghosts[neighbor_idx] else: self.h_ghosts[neighbor_idx] = ch.recv() + ''' self.d_ghosts[neighbor_idx].copy_to_device(self.h_ghosts[neighbor_idx], stream=self.stream ) @@ -287,7 +286,6 @@ def recvGhosts(self): self.h_ghosts[neighbor_idx].__array_interface__['data'][0], self.d_ghosts[neighbor_idx].nbytes, 0 ) - ''' recv_ghost = self.d_ghosts[neighbor_idx] kernels.invokeUnpackingKernel(self.d_temperature, From 3069ce263356a07f25258b34b4059014c0017a83 Mon Sep 17 00:00:00 2001 From: Jaemin Choi Date: Thu, 28 Jan 2021 00:14:40 -0500 Subject: [PATCH 078/107] Fix recvGhost to work with more than 2 processes --- examples/cuda/gpudirect/jacobi3d/block.py | 66 ++++++++++++----------- 1 file changed, 35 insertions(+), 31 deletions(-) diff --git a/examples/cuda/gpudirect/jacobi3d/block.py b/examples/cuda/gpudirect/jacobi3d/block.py index f96275a6..b99b534c 100644 --- a/examples/cuda/gpudirect/jacobi3d/block.py +++ b/examples/cuda/gpudirect/jacobi3d/block.py @@ -265,37 +265,41 @@ def sendGhost(self, direction): @coro def recvGhosts(self): - for ch in charm.iwait(self.active_neighbor_channels): - # remember: we set 'recv_direction' member - # directly in the initialization phase - neighbor_idx = ch.recv_direction - - if use_zerocopy: - ch.recv(post_buf_addresses = self.d_recv_ghosts_addr[neighbor_idx], - post_buf_sizes = self.d_recv_ghosts_size[neighbor_idx] - ) - recv_ghost = self.d_recv_ghosts[neighbor_idx] - else: - self.h_ghosts[neighbor_idx] = ch.recv() - ''' - self.d_ghosts[neighbor_idx].copy_to_device(self.h_ghosts[neighbor_idx], - stream=self.stream - ) - ''' - charm.lib.CudaHtoD(self.d_ghosts[neighbor_idx].__cuda_array_interface__['data'][0], - self.h_ghosts[neighbor_idx].__array_interface__['data'][0], - self.d_ghosts[neighbor_idx].nbytes, 0 - ) - recv_ghost = self.d_ghosts[neighbor_idx] - - kernels.invokeUnpackingKernel(self.d_temperature, - recv_ghost, - ch.recv_direction, - block_width, - block_height, - block_depth, - self.stream - ) + # Not using charm.iwait as it errors out with more than 2 processes + for dir in range(kernels.DIR_COUNT): + if not self.bounds[dir]: + self.recvGhost(dir) + 
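
[Editor's note, not part of the patch above] This hunk drops completion-order receives (charm.iwait) in favor of a fixed-order loop because iwait errored out with more than two processes at this point in the series; patches 082-084 later restore completion-order delivery through charm.iwait_map. A minimal illustrative sketch of the two receive patterns, using only names defined in block.py (not applied code):

    # fixed order, as in this patch: block on each open face in turn
    for direction in range(kernels.DIR_COUNT):
        if not self.bounds[direction]:
            self.recvGhost(direction)

    # completion order, as restored by patches 082-084: recvGhost is reworked
    # to take the channel itself, and each ghost is unpacked as soon as its
    # channel has data available
    charm.iwait_map(self.recvGhost, self.active_neighbor_channels)
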
+ @coro + def recvGhost(self, direction): + recv_ch = self.neighbor_channels[direction] + + if use_zerocopy: + recv_ch.recv(post_buf_addresses = self.d_recv_ghosts_addr[direction], + post_buf_sizes = self.d_recv_ghosts_size[direction] + ) + recv_ghost = self.d_recv_ghosts[direction] + else: + self.h_ghosts[direction] = recv_ch.recv() + ''' + self.d_ghosts[direction].copy_to_device(self.h_ghosts[direction], + stream=self.stream + ) + ''' + charm.lib.CudaHtoD(self.d_ghosts[direction].__cuda_array_interface__['data'][0], + self.h_ghosts[direction].__array_interface__['data'][0], + self.d_ghosts[direction].nbytes, 0 + ) + recv_ghost = self.d_ghosts[direction] + + kernels.invokeUnpackingKernel(self.d_temperature, + recv_ghost, + direction, + block_width, + block_height, + block_depth, + self.stream + ) self.stream.synchronize() @coro From 12483a018345688a5ef0e20930f4256564b9ad9e Mon Sep 17 00:00:00 2001 From: Jaemin Choi Date: Thu, 28 Jan 2021 00:56:52 -0500 Subject: [PATCH 079/107] Fix chare dimension calculation error --- examples/cuda/gpudirect/jacobi3d/jacobi3d.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cuda/gpudirect/jacobi3d/jacobi3d.py b/examples/cuda/gpudirect/jacobi3d/jacobi3d.py index c759807c..2a2cacbe 100644 --- a/examples/cuda/gpudirect/jacobi3d/jacobi3d.py +++ b/examples/cuda/gpudirect/jacobi3d/jacobi3d.py @@ -159,7 +159,7 @@ def calc_num_chares_per_dim(num_chares, grid_w, grid_h, grid_d): while ipy <= nremain: if not nremain % ipy: ipz = nremain // ipy - surf = area[0] / ipx / ipy + area[1] / ipz + area[2] / ipy / ipz + surf = area[0] / ipx / ipy + area[1] / ipx / ipz + area[2] / ipy / ipz if surf < bestsurf: bestsurf = surf From b02d6d79b9e0e47f9bb7a0f1169c0d9c4d8e6a1e Mon Sep 17 00:00:00 2001 From: Jaemin Choi Date: Thu, 28 Jan 2021 00:57:04 -0500 Subject: [PATCH 080/107] Add scripts --- .../gpudirect/jacobi3d/scripts/charm4py.sh | 46 ++++++++++++++ .../jacobi3d/scripts/process_times.py | 63 +++++++++++++++++++ 2 files changed, 109 insertions(+) create mode 100755 examples/cuda/gpudirect/jacobi3d/scripts/charm4py.sh create mode 100755 examples/cuda/gpudirect/jacobi3d/scripts/process_times.py diff --git a/examples/cuda/gpudirect/jacobi3d/scripts/charm4py.sh b/examples/cuda/gpudirect/jacobi3d/scripts/charm4py.sh new file mode 100755 index 00000000..2d07c54c --- /dev/null +++ b/examples/cuda/gpudirect/jacobi3d/scripts/charm4py.sh @@ -0,0 +1,46 @@ +#!/bin/bash +#BSUB -W 30 +#BSUB -P csc357 +#BSUB -nnodes 2 +#BSUB -J jacobi3d-charm4py-weak-n2 + +# These need to be changed between submissions +file=jacobi3d.py +n_nodes=2 +n_procs=$((n_nodes * 6)) +grid_width=3072 +grid_height=1536 +grid_depth=1536 + +# Function to display commands +exe() { echo "\$ $@" ; "$@" ; } + +cd $HOME/work/charm4py/examples/cuda/gpudirect/jacobi3d + +conda activate charm4py + +export LD_LIBRARY_PATH=$HOME/work/ucx/install/lib:$HOME/work/pmix-3.1.5/install/lib:/sw/summit/gdrcopy/2.0/lib64:$LD_LIBRARY_PATH +export UCX_MEMTYPE_CACHE=n + +ppn=1 +pemap="L0,4,8,84,88,92" +n_iters=100 +warmup_iters=10 + +echo "# Charm4py Jacobi3D Performance Benchmarking (GPUDirect off)" + +for iter in 1 2 3 +do + date + echo "# Run $iter" + exe jsrun -n$n_procs -a1 -c$ppn -g1 -K3 -r6 --smpiargs="-disable_gpu_hooks" python3 ./$file -x $grid_width -y $grid_height -z $grid_depth -w $warmup_iters -i $n_iters +ppn $ppn +pemap $pemap +done + +echo "# Charm4py Jacobi3D Performance Benchmarking (GPUDirect on)" + +for iter in 1 2 3 +do + date + echo "# Run $iter" + exe jsrun -n$n_procs -a1 -c$ppn -g1 -K3 
-r6 --smpiargs="-disable_gpu_hooks" python3 ./$file -x $grid_width -y $grid_height -z $grid_depth -w $warmup_iters -i $n_iters +ppn $ppn +pemap $pemap -d +done diff --git a/examples/cuda/gpudirect/jacobi3d/scripts/process_times.py b/examples/cuda/gpudirect/jacobi3d/scripts/process_times.py new file mode 100755 index 00000000..0063b79d --- /dev/null +++ b/examples/cuda/gpudirect/jacobi3d/scripts/process_times.py @@ -0,0 +1,63 @@ +#!/usr/bin/env python3 +import os +import sys +import csv +import statistics + +if len(sys.argv) != 4: + print('Please use', sys.argv[0], '[job name] [start node count] [end node count]') + exit() + +job_name = sys.argv[1] +start_node_count = int(sys.argv[2]) +end_node_count = int(sys.argv[3]) + +csv_filename = job_name + '.csv' +csv_file = open(csv_filename, 'w', newline='') +writer = csv.writer(csv_file) +writer.writerow(['Number of GPUs', 'Charm4py-H-Total', 'error', 'Charm4py-H-Comm', 'error', 'Charm4py-D-Total', 'error', 'Charm4py-D-Comm', 'error']) + +def is_host(index): + return index % 6 == 0 or index % 6 == 1 or index % 6 == 2 + +node_count_list = [] +cur_node_count = start_node_count +while cur_node_count <= end_node_count: + node_count_list.append(cur_node_count) + cur_node_count *= 2 + +for node_count in node_count_list: + print('Node count:', str(node_count)) + total_str = 'grep -ir "Average time per" ' + job_name + '-n' + str(node_count) + '.* | cut -d " " -f5' + comm_str = 'grep -ir "Communication time" ' + job_name + '-n' + str(node_count) + '.* | cut -d " " -f5' + + total_stream = os.popen(total_str) + total_lines = total_stream.readlines() + total_times = list(map(lambda x: x, list(map(float, list(map(str.rstrip, total_lines)))))) + comm_stream = os.popen(comm_str) + comm_lines = comm_stream.readlines() + comm_times = list(map(lambda x: x, list(map(float, list(map(str.rstrip, comm_lines)))))) + + h_total_times = [total_times[i] for i in range(len(total_times)) if is_host(i)] + h_comm_times = [comm_times[i] for i in range(len(comm_times)) if is_host(i)] + d_total_times = [total_times[i] for i in range(len(total_times)) if not is_host(i)] + d_comm_times = [comm_times[i] for i in range(len(comm_times)) if not is_host(i)] + print('H total:', h_total_times) + print('H comm:', h_comm_times) + print('D total:', d_total_times) + print('D comm:', d_comm_times) + + h_total_avg = round(statistics.mean(h_total_times), 2) + h_total_stdev = round(statistics.stdev(h_total_times), 2) + h_comm_avg = round(statistics.mean(h_comm_times), 2) + h_comm_stdev = round(statistics.stdev(h_comm_times), 2) + d_total_avg = round(statistics.mean(d_total_times), 2) + d_total_stdev = round(statistics.stdev(d_total_times), 2) + d_comm_avg = round(statistics.mean(d_comm_times), 2) + d_comm_stdev = round(statistics.stdev(d_comm_times), 2) + print('H total avg:', h_total_avg, 'stdev:', h_total_stdev) + print('H comm avg:', h_comm_avg, 'stdev:', h_comm_stdev) + print('D total avg:', d_total_avg, 'stdev:', d_total_stdev) + print('D comm avg:', d_comm_avg, 'stdev:', d_comm_stdev) + + writer.writerow([str(node_count), str(h_total_avg), str(h_total_stdev), str(h_comm_avg), str(h_comm_stdev), str(d_total_avg), str(d_total_stdev), str(d_comm_avg), str(d_comm_stdev)]) From 20913a94e8be5256625f1c6a37d59b44427f9ece Mon Sep 17 00:00:00 2001 From: Jaemin Choi Date: Thu, 28 Jan 2021 15:57:42 -0500 Subject: [PATCH 081/107] Update Charm4py script, need to use source activate instead of conda activate --- .../gpudirect/jacobi3d/scripts/charm4py.sh | 18 ++++++++++-------- 1 file changed, 10 
insertions(+), 8 deletions(-) diff --git a/examples/cuda/gpudirect/jacobi3d/scripts/charm4py.sh b/examples/cuda/gpudirect/jacobi3d/scripts/charm4py.sh index 2d07c54c..54fcaa78 100755 --- a/examples/cuda/gpudirect/jacobi3d/scripts/charm4py.sh +++ b/examples/cuda/gpudirect/jacobi3d/scripts/charm4py.sh @@ -1,26 +1,28 @@ #!/bin/bash #BSUB -W 30 #BSUB -P csc357 -#BSUB -nnodes 2 -#BSUB -J jacobi3d-charm4py-weak-n2 +#BSUB -nnodes 256 +#BSUB -J jacobi3d-charm4py-strong-n256 # These need to be changed between submissions file=jacobi3d.py -n_nodes=2 +n_nodes=256 n_procs=$((n_nodes * 6)) grid_width=3072 -grid_height=1536 -grid_depth=1536 +grid_height=3072 +grid_depth=3072 # Function to display commands exe() { echo "\$ $@" ; "$@" ; } cd $HOME/work/charm4py/examples/cuda/gpudirect/jacobi3d -conda activate charm4py +#exe conda init bash +#exe conda activate charm4py +exe source activate charm4py -export LD_LIBRARY_PATH=$HOME/work/ucx/install/lib:$HOME/work/pmix-3.1.5/install/lib:/sw/summit/gdrcopy/2.0/lib64:$LD_LIBRARY_PATH -export UCX_MEMTYPE_CACHE=n +exe export LD_LIBRARY_PATH=$HOME/work/ucx/install/lib:$HOME/work/pmix-3.1.5/install/lib:/sw/summit/gdrcopy/2.0/lib64:$LD_LIBRARY_PATH +exe export UCX_MEMTYPE_CACHE=n ppn=1 pemap="L0,4,8,84,88,92" From 581dae957171fef7668e46804ccf4c695aa84f14 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 17 Mar 2021 10:21:06 -0500 Subject: [PATCH 082/107] Add iwait_map function to fix the issue --- charm4py/charm.py | 50 +++++++++++++++++++++++++++++++++++++++++++++ charm4py/threads.py | 6 +++++- 2 files changed, 55 insertions(+), 1 deletion(-) diff --git a/charm4py/charm.py b/charm4py/charm.py index a13935fe..b34f287f 100644 --- a/charm4py/charm.py +++ b/charm4py/charm.py @@ -28,6 +28,7 @@ from . import reduction from . import wait import array +import greenlet try: import numpy except ImportError: @@ -820,12 +821,61 @@ def iwait(self, objs): n -= 1 yield obj else: + print('waitready this') obj.waitReady(f) while n > 0: + print('thread paused') obj = self.threadMgr.pauseThread() + print('thread resumed', obj) n -= 1 + print('n', n) yield obj + def iwait_map(self, func, objs): + n = len(objs) + f = LocalFuture() + done_fut = LocalFuture() + remaining_grs = [n] + + def map_func(remaining, obj): + gr = greenlet.getcurrent() + gr.notify = gr.parent.notify + gr.obj = gr.parent.obj + gr.fu = 1 + func(obj) + remaining[0] -= 1 + + def gr_func(): + return map_func(remaining_grs, obj) + + for obj in objs: + if obj.ready(): + new_gr = greenlet.greenlet(gr_func) + n -= 1 + new_gr.switch() + else: + obj.waitReady(f) + while n > 0: + obj = self.threadMgr.pauseThread() + # if obj is None, then we are being resumed by a finishing greenlet + # and should pause + if obj is None: + continue + + new_gr = greenlet.greenlet(gr_func) + n -= 1 + obj = new_gr.switch() + + # if ret is not None, then we are being resumed + #by the thread manager because a new object is ready to receive + if obj: + new_gr = greenlet.greenlet(gr_func) + n -= 1 + new_gr.switch() + + while remaining_grs[0]: + self.threadMgr.pauseThread() + def wait(self, objs): for o in self.iwait(objs): pass diff --git a/charm4py/threads.py b/charm4py/threads.py index e05678cf..38343a2b 100644 --- a/charm4py/threads.py +++ b/charm4py/threads.py @@ -166,11 +166,15 @@ def pauseThread(self): if gr.notify: obj = gr.obj obj._thread_notify_target.threadPaused(obj._thread_notify_data) - if gr.parent != main_gr: + if False and gr.parent != main_gr: # this can happen with threaded chare constructors that are called # "inline" by Charm++ 
on the PE where the collection is created. # Initially it will switch back to the parent thread, but after that # we make the parent to be the main thread + # try: + # if gr.fu: + # return main_gr.switch() + # except: parent = gr.parent gr.parent = main_gr return parent.switch() From 65ad6116aeebbb68d8f9c2701b84d528e2809460 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 31 Mar 2021 16:52:47 -0400 Subject: [PATCH 083/107] update to use charm.iwait_map --- examples/cuda/gpudirect/jacobi3d/block.py | 58 +++++++++++++++----- examples/cuda/gpudirect/jacobi3d/jacobi3d.py | 2 +- 2 files changed, 44 insertions(+), 16 deletions(-) diff --git a/examples/cuda/gpudirect/jacobi3d/block.py b/examples/cuda/gpudirect/jacobi3d/block.py index b99b534c..4732ecb7 100644 --- a/examples/cuda/gpudirect/jacobi3d/block.py +++ b/examples/cuda/gpudirect/jacobi3d/block.py @@ -246,13 +246,13 @@ def updateAndPack(self): self.stream.synchronize() - @coro def sendGhosts(self): + count = 0 for dir in range(kernels.DIR_COUNT): if not self.bounds[dir]: self.sendGhost(dir) + count += 1 - @coro def sendGhost(self, direction): send_ch = self.neighbor_channels[direction] @@ -263,19 +263,48 @@ def sendGhost(self, direction): else: send_ch.send(self.h_ghosts[direction]) - @coro def recvGhosts(self): - # Not using charm.iwait as it errors out with more than 2 processes - for dir in range(kernels.DIR_COUNT): - if not self.bounds[dir]: - self.recvGhost(dir) - - @coro - def recvGhost(self, direction): - recv_ch = self.neighbor_channels[direction] - + charm.iwait_map(self.recvGhost, self.active_neighbor_channels) + # for ch in charm.iwait(self.active_neighbor_channels): + # # remember: we set 'recv_direction' member + # # directly in the initialization phase + # neighbor_idx = ch.recv_direction + + # if use_zerocopy: + # ch.recv(post_buf_addresses = self.d_recv_ghosts_addr[neighbor_idx], + # post_buf_sizes = self.d_recv_ghosts_size[neighbor_idx] + # ) + # recv_ghost = self.d_recv_ghosts[neighbor_idx] + # else: + # self.h_ghosts[neighbor_idx] = ch.recv() + # ''' + # self.d_ghosts[neighbor_idx].copy_to_device(self.h_ghosts[neighbor_idx], + # stream=self.stream + # ) + # ''' + # charm.lib.CudaHtoD(self.d_ghosts[neighbor_idx].__cuda_array_interface__['data'][0], + # self.h_ghosts[neighbor_idx].__array_interface__['data'][0], + # self.d_ghosts[neighbor_idx].nbytes, 0 + # ) + # recv_ghost = self.d_ghosts[neighbor_idx] + + # kernels.invokeUnpackingKernel(self.d_temperature, + # recv_ghost, + # ch.recv_direction, + # block_width, + # block_height, + # block_depth, + # self.stream + # ) + # # Not using charm.iwait as it errors out with more than 2 processes + # for dir in range(kernels.DIR_COUNT): + # if not self.bounds[dir]: + # self.recvGhost(dir) + + def recvGhost(self, recv_ch): + direction = recv_ch.recv_direction if use_zerocopy: - recv_ch.recv(post_buf_addresses = self.d_recv_ghosts_addr[direction], + f = recv_ch.recv(post_buf_addresses = self.d_recv_ghosts_addr[direction], post_buf_sizes = self.d_recv_ghosts_size[direction] ) recv_ghost = self.d_recv_ghosts[direction] @@ -302,7 +331,7 @@ def recvGhost(self, direction): ) self.stream.synchronize() - @coro + # @coro def exchangeGhosts(self): self.d_temperature, self.d_new_temperature = \ self.d_new_temperature, self.d_temperature @@ -328,7 +357,6 @@ def run(self, done_future): if current_iter >= warmup_iters: comm_time += time.time() - comm_start_time - tend = time.time() if self.thisIndex == (0, 0, 0): diff --git a/examples/cuda/gpudirect/jacobi3d/jacobi3d.py 
b/examples/cuda/gpudirect/jacobi3d/jacobi3d.py index 2a2cacbe..b3f9bcd7 100644 --- a/examples/cuda/gpudirect/jacobi3d/jacobi3d.py +++ b/examples/cuda/gpudirect/jacobi3d/jacobi3d.py @@ -128,13 +128,13 @@ def main(args): ).get() init_done_future = Future() + run_future = Future() block_proxy = Array(Block, dims=[n_chares_x, n_chares_y, n_chares_z], args = [init_done_future] ) init_done_future.get() - run_future = Future() block_proxy.run(run_future) run_future.get() charm.exit() From 69dd6f51d633159b26cd76e89989cc82ba02ff1d Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 31 Mar 2021 16:55:17 -0400 Subject: [PATCH 084/107] Fix charm.iwait_map bug --- charm4py/channel.py | 1 + charm4py/charm.py | 31 +++++++++++-------------------- charm4py/threads.py | 3 ++- 3 files changed, 14 insertions(+), 21 deletions(-) diff --git a/charm4py/channel.py b/charm4py/channel.py index aaf058ec..74726b95 100644 --- a/charm4py/channel.py +++ b/charm4py/channel.py @@ -68,4 +68,5 @@ def recv(self): ret = self.recv_fut.get() self.recv_fut = None self.recv_seqno = (self.recv_seqno + 1) % CHAN_BUF_SIZE + return ret diff --git a/charm4py/charm.py b/charm4py/charm.py index b34f287f..8f847c78 100644 --- a/charm4py/charm.py +++ b/charm4py/charm.py @@ -816,25 +816,21 @@ def triggerCallable(self, tag): def iwait(self, objs): n = len(objs) f = LocalFuture() + for obj in objs: if obj.ready(): n -= 1 yield obj else: - print('waitready this') obj.waitReady(f) while n > 0: - print('thread paused') obj = self.threadMgr.pauseThread() - print('thread resumed', obj) n -= 1 - print('n', n) yield obj def iwait_map(self, func, objs): n = len(objs) f = LocalFuture() - done_fut = LocalFuture() remaining_grs = [n] def map_func(remaining, obj): @@ -852,27 +848,22 @@ def gr_func(): if obj.ready(): new_gr = greenlet.greenlet(gr_func) n -= 1 - new_gr.switch() + obj = new_gr.switch() + while obj: + assert isinstance(obj, Channel) + new_gr = greenlet.greenlet(gr_func) + n -= 1 + obj = new_gr.switch() else: obj.waitReady(f) while n > 0: obj = self.threadMgr.pauseThread() - # if obj is None, then we are being resumed by a finishing greenlet - # and should pause - if obj is None: - continue - - new_gr = greenlet.greenlet(gr_func) - n -= 1 - obj = new_gr.switch() - - # if ret is not None, then we are being resumed - #by the thread manager because a new object is ready to receive - if obj: + while obj: + assert isinstance(obj, Channel) new_gr = greenlet.greenlet(gr_func) n -= 1 - new_gr.switch() - + obj = new_gr.switch() + while remaining_grs[0]: self.threadMgr.pauseThread() diff --git a/charm4py/threads.py b/charm4py/threads.py index 38343a2b..fd7cec3e 100644 --- a/charm4py/threads.py +++ b/charm4py/threads.py @@ -179,7 +179,8 @@ def pauseThread(self): gr.parent = main_gr return parent.switch() else: - return main_gr.switch() + ret_val = main_gr.switch() + return ret_val def _resumeThread(self, gr, arg): """ Deposit a result or signal that a local entry method thread is waiting on, From afeaa6b3329aabfcb4e13dfda0ed1b2788ec1487 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 31 Mar 2021 16:00:51 -0500 Subject: [PATCH 085/107] no need for channel assertion --- charm4py/charm.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/charm4py/charm.py b/charm4py/charm.py index 8f847c78..f78f5188 100644 --- a/charm4py/charm.py +++ b/charm4py/charm.py @@ -850,7 +850,6 @@ def gr_func(): n -= 1 obj = new_gr.switch() while obj: - assert isinstance(obj, Channel) new_gr = greenlet.greenlet(gr_func) n -= 1 obj = new_gr.switch() @@ -859,7 +858,6 @@ def 
gr_func(): while n > 0: obj = self.threadMgr.pauseThread() while obj: - assert isinstance(obj, Channel) new_gr = greenlet.greenlet(gr_func) n -= 1 obj = new_gr.switch() From 0f34c1a0a4ef7cd0f8c13e87d07009e3eed18fef Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 21 Apr 2021 11:55:42 -0400 Subject: [PATCH 086/107] correctly differentiate between instances when GPU data sent with other types --- charm4py/channel.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/charm4py/channel.py b/charm4py/channel.py index 4584c618..2e84d481 100644 --- a/charm4py/channel.py +++ b/charm4py/channel.py @@ -72,14 +72,25 @@ def recv(self, *post_buffers, post_buf_addresses = None, post_buf_sizes = None, self.recv_seqno = (self.recv_seqno + 1) % CHAN_BUF_SIZE if post_buffers: - gpu_recv_bufs = ret[-1] - # ret = ret[:-1:1] + if isinstance(ret, tuple): + print(ret) + gpu_recv_bufs = ret[-1] + ret = ret[0:-1] + if len(ret) == 1: + ret = ret[0] + else: + gpu_recv_bufs = ret assert len(post_buffers) == len(gpu_recv_bufs) recv_future = charm.getGPUDirectData(post_buffers, gpu_recv_bufs, stream_ptrs) recv_future.get() elif post_buf_addresses != None: - gpu_recv_bufs = ret - # ret = ret[:-1:1] + if isinstance(ret, tuple): + gpu_recv_bufs = ret[-1] + ret = ret[0:-1] + if len(ret) == 1: + ret = ret[0] + else: + gpu_recv_bufs = ret assert len(post_buf_addresses) == len(gpu_recv_bufs) assert post_buf_sizes recv_future = charm.getGPUDirectDataFromAddresses(post_buf_addresses, post_buf_sizes, gpu_recv_bufs, stream_ptrs) From 92f2cba4743a1baab59de72a13feef91f1661699 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 21 Apr 2021 11:56:49 -0400 Subject: [PATCH 087/107] numpy arrays can now be sent with GPU-direct data --- charm4py/charm.py | 1 + charm4py/charmlib/ccharm.pxd | 8 +++ charm4py/charmlib/charmlib_cython.pyx | 73 +++++++++++++++++++-------- 3 files changed, 60 insertions(+), 22 deletions(-) diff --git a/charm4py/charm.py b/charm4py/charm.py index 4b2235d8..71df707f 100644 --- a/charm4py/charm.py +++ b/charm4py/charm.py @@ -134,6 +134,7 @@ def __init__(self): self.CkArraySend = self.lib.CkArraySend self.CkArraySendWithDeviceData = self.lib.CkArraySendWithDeviceData self.CkArraySendWithDeviceDataFromPointers = self.lib.CkArraySendWithDeviceDataFromPointers + self.CkCudaEnabled = self.lib.CkCudaEnabled self.reducers = reduction.ReducerContainer(self) self.redMgr = reduction.ReductionManager(self, self.reducers) self.mainchareRegistered = False diff --git a/charm4py/charmlib/ccharm.pxd b/charm4py/charmlib/ccharm.pxd index e11c204f..9f2e9a9c 100644 --- a/charm4py/charmlib/ccharm.pxd +++ b/charm4py/charmlib/ccharm.pxd @@ -82,6 +82,14 @@ cdef extern from "charm.h": long *devBufSizesInBytes, long *streamPtrs, int numDevBufs ); + void CkChareExtSendWithDeviceData_multi(int aid, int *idx, int ndims, + int epIdx, int num_bufs, char **bufs, + int *buf_sizes, + long *devBufPtrs, + long *devBufSizesInBytes, + long *streamPtrs, int numDevBufs + ); + void registerArrayMsgGPUDirectRecvExtCallback(void (*cb)(int, int, int*, int, int, long*, void *, int, char*, int)); void CkGetGPUDirectData(int numBuffers, void *recvBufPtrs, int *arrSizes, diff --git a/charm4py/charmlib/charmlib_cython.pyx b/charm4py/charmlib/charmlib_cython.pyx index 271af7c9..b8ab34d8 100644 --- a/charm4py/charmlib/charmlib_cython.pyx +++ b/charm4py/charmlib/charmlib_cython.pyx @@ -477,6 +477,7 @@ class CharmLib(object): assert num_direct_buffers <= NUM_DCOPY_BUFS global gpu_direct_device_ptrs global gpu_direct_stream_ptrs 
+ global cur_buf if stream_ptrs: for i in range(num_direct_buffers): @@ -484,12 +485,24 @@ class CharmLib(object): else: memset(gpu_direct_stream_ptrs, 0, sizeof(long) * num_direct_buffers) - CkChareExtSendWithDeviceData(array_id, c_index, ndims, ep, 1, msg0, len(msg0), - gpu_direct_device_ptrs, - gpu_direct_buff_sizes, - gpu_direct_stream_ptrs, - num_direct_buffers - ) + if cur_buf <= 1: + CkChareExtSendWithDeviceData(array_id, c_index, ndims, ep, 1, msg0, len(msg0), + gpu_direct_device_ptrs, + gpu_direct_buff_sizes, + gpu_direct_stream_ptrs, + num_direct_buffers + ) + else: + send_bufs[0] = msg0 + send_buf_sizes[0] = len(msg0) + CkChareExtSendWithDeviceData_multi(array_id, c_index, ndims, ep, + cur_buf, send_bufs, send_buf_sizes, + gpu_direct_device_ptrs, + gpu_direct_buff_sizes, + gpu_direct_stream_ptrs, + num_direct_buffers + ) + cur_buf = 1 gpu_direct_buf_idx = 0 def CkArraySendWithDeviceDataFromPointers(self, int array_id, index not None, int ep, @@ -499,7 +512,7 @@ class CharmLib(object): cdef int i = 0 cdef int ndims = len(index) - # assert ndims == 1 + global cur_buf for i in range(ndims): c_index[i] = index[i] msg0, dcopy = msg dcopy = None @@ -510,15 +523,28 @@ class CharmLib(object): else: memset(gpu_direct_stream_ptrs, 0, sizeof(long) * num_bufs) - CkChareExtSendWithDeviceData(array_id, c_index, ndims, ep, 1, msg0, len(msg0), - gpu_src_ptrs.data.as_voidptr, - gpu_src_sizes.data.as_voidptr, - gpu_direct_stream_ptrs, - num_bufs - ) + if cur_buf <= 1: + CkChareExtSendWithDeviceData(array_id, c_index, ndims, ep, 1, msg0, len(msg0), + gpu_src_ptrs.data.as_voidptr, + gpu_src_sizes.data.as_voidptr, + gpu_direct_stream_ptrs, + num_bufs + ) + else: + send_bufs[0] = msg0 + send_buf_sizes[0] = len(msg0) + CkChareExtSendWithDeviceData_multi(array_id, c_index, ndims, ep, + cur_buf, send_bufs, send_buf_sizes, + gpu_src_ptrs.data.as_voidptr, + gpu_src_sizes.data.as_voidptr, + gpu_direct_stream_ptrs, + num_bufs + ) + cur_buf = 1 gpu_direct_buf_idx = 0 - + def CkCudaEnabled(self): + return bool(CkCudaEnabled()) def CkArraySend(self, int array_id, index not None, int ep, msg not None): global cur_buf @@ -860,13 +886,15 @@ class CharmLib(object): msg = emptyMsg else: direct_copy_hdr = [] # goes to header - args = list(msgArgs) global cur_buf global gpu_direct_buf_idx global gpu_direct_device_ptrs cur_buf = 1 gpu_direct_buf_idx = 0 - for i in range(len(args)): + # GPU-direct buffers will not be sent + args_to_send = list() + n_gpu_bufs = 0 + for i in range(len(msgArgs)): arg = msgArgs[i] if CkCudaEnabled() and hasattr(arg, '__cuda_array_interface__'): if pack_gpu: @@ -878,32 +906,33 @@ class CharmLib(object): gpu_direct_buff_sizes[gpu_direct_buf_idx] = arg.nbytes cuda_dev_info = True gpu_direct_buf_idx += 1 - args[i] = None # TODO: should this be done? 
+ n_gpu_bufs += 1 continue elif isinstance(arg, np.ndarray) and not arg.dtype.hasobject: np_array = arg nbytes = np_array.nbytes - direct_copy_hdr.append((i, 2, (arg.shape, np_array.dtype.name), nbytes)) + direct_copy_hdr.append((i-n_gpu_bufs, 2, (arg.shape, np_array.dtype.name), nbytes)) send_bufs[cur_buf] = np_array.data elif isinstance(arg, bytes): nbytes = len(arg) - direct_copy_hdr.append((i, 0, (), nbytes)) + direct_copy_hdr.append((i-n_gpu_bufs, 0, (), nbytes)) send_bufs[cur_buf] = arg elif isinstance(arg, array.array): a = arg #nbytes = arg.buffer_info()[1] * arg.itemsize nbytes = len(a) * a.itemsize # NOTE that cython's array C interface doesn't expose itemsize attribute - direct_copy_hdr.append((i, 1, (a.typecode), nbytes)) + direct_copy_hdr.append((i-n_gpu_bufs, 1, (a.typecode), nbytes)) send_bufs[cur_buf] = a.data.as_voidptr else: + args_to_send.append(arg) continue - args[i] = None # will direct-copy this arg so remove from args list + args_to_send.append(None) send_buf_sizes[cur_buf] = nbytes if PROFILING: dcopy_size += nbytes cur_buf += 1 if len(direct_copy_hdr) > 0: header[b'dcopy'] = direct_copy_hdr try: - msg = dumps((header, args), PICKLE_PROTOCOL) + msg = dumps((header, args_to_send), PICKLE_PROTOCOL) except: global cur_buf global gpu_direct_buf_idx From 760e7162843f6ceff7e4922fab7cb36661221312 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Thu, 22 Apr 2021 14:20:12 -0400 Subject: [PATCH 088/107] removed debug print --- charm4py/channel.py | 1 - 1 file changed, 1 deletion(-) diff --git a/charm4py/channel.py b/charm4py/channel.py index 2e84d481..22e29efb 100644 --- a/charm4py/channel.py +++ b/charm4py/channel.py @@ -73,7 +73,6 @@ def recv(self, *post_buffers, post_buf_addresses = None, post_buf_sizes = None, if post_buffers: if isinstance(ret, tuple): - print(ret) gpu_recv_bufs = ret[-1] ret = ret[0:-1] if len(ret) == 1: From c254008dddfab68169119874e48c691ff1782363 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Thu, 22 Apr 2021 14:20:45 -0400 Subject: [PATCH 089/107] no need for different function when buffers + gpu arrays sent in same message --- charm4py/charmlib/ccharm.pxd | 19 +++++---------- charm4py/charmlib/charmlib_cython.pyx | 34 +++++++-------------------- 2 files changed, 15 insertions(+), 38 deletions(-) diff --git a/charm4py/charmlib/ccharm.pxd b/charm4py/charmlib/ccharm.pxd index 9f2e9a9c..12db2492 100644 --- a/charm4py/charmlib/ccharm.pxd +++ b/charm4py/charmlib/ccharm.pxd @@ -76,19 +76,12 @@ cdef extern from "charm.h": int CkCudaEnabled(); int CUDAPointerOnDevice(const void *ptr); void CkChareExtSendWithDeviceData(int aid, int *idx, int ndims, - int epIdx, int num_bufs, char *msg, - int msgSize, - long *devBufPtrs, - long *devBufSizesInBytes, - long *streamPtrs, int numDevBufs - ); - void CkChareExtSendWithDeviceData_multi(int aid, int *idx, int ndims, - int epIdx, int num_bufs, char **bufs, - int *buf_sizes, - long *devBufPtrs, - long *devBufSizesInBytes, - long *streamPtrs, int numDevBufs - ); + int epIdx, int num_bufs, char **bufs, + int *buf_sizes, + long *devBufPtrs, + long *devBufSizesInBytes, + long *streamPtrs, int numDevBufs + ); void registerArrayMsgGPUDirectRecvExtCallback(void (*cb)(int, int, int*, int, int, long*, void *, int, char*, int)); diff --git a/charm4py/charmlib/charmlib_cython.pyx b/charm4py/charmlib/charmlib_cython.pyx index b8ab34d8..58951fb1 100644 --- a/charm4py/charmlib/charmlib_cython.pyx +++ b/charm4py/charmlib/charmlib_cython.pyx @@ -485,23 +485,15 @@ class CharmLib(object): else: memset(gpu_direct_stream_ptrs, 0, 
sizeof(long) * num_direct_buffers) - if cur_buf <= 1: - CkChareExtSendWithDeviceData(array_id, c_index, ndims, ep, 1, msg0, len(msg0), + send_bufs[0] = msg0 + send_buf_sizes[0] = len(msg0) + CkChareExtSendWithDeviceData(array_id, c_index, ndims, ep, + cur_buf, send_bufs, send_buf_sizes, gpu_direct_device_ptrs, gpu_direct_buff_sizes, gpu_direct_stream_ptrs, num_direct_buffers - ) - else: - send_bufs[0] = msg0 - send_buf_sizes[0] = len(msg0) - CkChareExtSendWithDeviceData_multi(array_id, c_index, ndims, ep, - cur_buf, send_bufs, send_buf_sizes, - gpu_direct_device_ptrs, - gpu_direct_buff_sizes, - gpu_direct_stream_ptrs, - num_direct_buffers - ) + ) cur_buf = 1 gpu_direct_buf_idx = 0 @@ -523,23 +515,15 @@ class CharmLib(object): else: memset(gpu_direct_stream_ptrs, 0, sizeof(long) * num_bufs) - if cur_buf <= 1: - CkChareExtSendWithDeviceData(array_id, c_index, ndims, ep, 1, msg0, len(msg0), + send_bufs[0] = msg0 + send_buf_sizes[0] = len(msg0) + CkChareExtSendWithDeviceData(array_id, c_index, ndims, ep, + cur_buf, send_bufs, send_buf_sizes, gpu_src_ptrs.data.as_voidptr, gpu_src_sizes.data.as_voidptr, gpu_direct_stream_ptrs, num_bufs ) - else: - send_bufs[0] = msg0 - send_buf_sizes[0] = len(msg0) - CkChareExtSendWithDeviceData_multi(array_id, c_index, ndims, ep, - cur_buf, send_bufs, send_buf_sizes, - gpu_src_ptrs.data.as_voidptr, - gpu_src_sizes.data.as_voidptr, - gpu_direct_stream_ptrs, - num_bufs - ) cur_buf = 1 gpu_direct_buf_idx = 0 From 75f358d75d226ca5828446f48b00c972bcdb66bd Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Thu, 22 Apr 2021 14:28:25 -0400 Subject: [PATCH 090/107] tests sending one device array --- tests/cuda/single_array.py | 108 +++++++++++++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) create mode 100644 tests/cuda/single_array.py diff --git a/tests/cuda/single_array.py b/tests/cuda/single_array.py new file mode 100644 index 00000000..f720c439 --- /dev/null +++ b/tests/cuda/single_array.py @@ -0,0 +1,108 @@ +from charm4py import charm, Chare, Array, coro, Future, Channel, Group, ArrayMap +import time +import numpy as np +from numba import cuda +import array + + +class A(Chare): + def __init__(self, msg_size): + self.msg_size = msg_size + + + @coro + def run(self, done_future, addr_optimization = False): + partner = self.thisProxy[int(not self.thisIndex[0])] + partner_channel = Channel(self, partner) + + device_data = cuda.device_array(self.msg_size, dtype='int8') + # if addr_optimization: + d_addr = array.array('L', [0]) + d_size = array.array('L', [0]) + + d_addr[0] = device_data.__cuda_array_interface__['data'][0] + d_size[0] = device_data.nbytes + + my_stream = cuda.stream() + + if self.thisIndex[0]: + host_data = np.zeros(self.msg_size, dtype='int8') + host_data.fill(5) + device_data.copy_to_device(host_data) + if addr_optimization: + partner_channel.send(1, 2, "hello", + np.ones(self.msg_size, dtype='int8'), + gpu_src_ptrs = d_addr, gpu_src_sizes = d_size + ) + p_data = partner_channel.recv(post_buf_addresses = d_addr, post_buf_sizes = d_size) + else: + partner_channel.send(1,2, "hello", + device_data, + np.ones(self.msg_size, dtype='int8') + ) + p_data = partner_channel.recv(device_data) + + assert p_data == (2, 3) + h_ary = device_data.copy_to_host() + assert np.array_equal(h_ary, host_data) + + if addr_optimization: + partner_channel.send(gpu_src_ptrs = d_addr, gpu_src_sizes = d_size) + partner_channel.recv(post_buf_addresses = d_addr, post_buf_sizes = d_size) + else: + partner_channel.send(device_data) + partner_channel.recv(device_data) + + h_ary 
= device_data.copy_to_host() + assert np.array_equal(h_ary, host_data) + else: + h_data = np.ones(self.msg_size, dtype='int8') + if addr_optimization: + p_data = partner_channel.recv(post_buf_addresses = d_addr, post_buf_sizes = d_size) + else: + p_data = partner_channel.recv(device_data) + p_data, p_host_arr = p_data[0:-1], p_data[-1] + recvd = device_data.copy_to_host() + + compare = np.zeros(self.msg_size, dtype='int8') + compare.fill(5) + assert np.array_equal(recvd, compare) + assert np.array_equal(np.ones(self.msg_size, dtype='int8'), p_host_arr) + assert p_data == (1, 2, "hello") + + if addr_optimization: + partner_channel.send(2,3, gpu_src_ptrs = d_addr, gpu_src_sizes = d_size) + else: + partner_channel.send(2,3, device_data) + + if addr_optimization: + partner_channel.recv(post_buf_addresses = d_addr, post_buf_sizes = d_size) + partner_channel.send(gpu_src_ptrs = d_addr, gpu_src_sizes = d_size) + else: + partner_channel.recv(device_data) + partner_channel.send(device_data) + + + self.reduce(done_future) + +class ArrMap(ArrayMap): + def procNum(self, index): + return index[0] % 2 + +def main(args): + # if this is not a cuda-aware build, + # vacuously pass the test + if not charm.CkCudaEnabled(): + print("WARNING: Charm4Py was not build with CUDA-enabled Charm++. " + "GPU-Direct functionality will not be tested" + ) + charm.exit(0) + + peMap = Group(ArrMap) + chares = Array(A, 2, args=[8192], map = peMap) + done_fut = Future() + chares.run(done_fut, addr_optimization = False) + done_fut.get() + charm.exit(0) + +charm.start(main) From 7ee684e9d62942d3c7ecdd884c8c3864011ad3d9 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Thu, 22 Apr 2021 14:32:44 -0400 Subject: [PATCH 091/107] flake8 compliance --- tests/cuda/single_array.py | 53 ++++++++++++++++++++++++-------------- 1 file changed, 33 insertions(+), 20 deletions(-) diff --git a/tests/cuda/single_array.py b/tests/cuda/single_array.py index f720c439..526aaf92 100644 --- a/tests/cuda/single_array.py +++ b/tests/cuda/single_array.py @@ -1,5 +1,4 @@ from charm4py import charm, Chare, Array, coro, Future, Channel, Group, ArrayMap -import time import numpy as np from numba import cuda import array @@ -9,9 +8,8 @@ class A(Chare): def __init__(self, msg_size): self.msg_size = msg_size - @coro - def run(self, done_future, addr_optimization = False): + def run(self, done_future, addr_optimization=False): partner = self.thisProxy[int(not self.thisIndex[0])] partner_channel = Channel(self, partner) @@ -23,8 +21,6 @@ def run(self, done_future, addr_optimization = False): d_addr[0] = device_data.__cuda_array_interface__['data'][0] d_size[0] = device_data.nbytes - my_stream = cuda.stream() - if self.thisIndex[0]: host_data = np.zeros(self.msg_size, dtype='int8') host_data.fill(5) @@ -32,11 +28,13 @@ def run(self, done_future, addr_optimization = False): if addr_optimization: partner_channel.send(1, 2, "hello", np.ones(self.msg_size, dtype='int8'), - gpu_src_ptrs = d_addr, gpu_src_sizes = d_size + gpu_src_ptrs=d_addr, gpu_src_sizes=d_size ) - p_data = partner_channel.recv(post_buf_addresses = d_addr, post_buf_sizes = d_size) + p_data = partner_channel.recv(post_buf_addresses=d_addr, + post_buf_sizes=d_size + ) else: - partner_channel.send(1,2, "hello", + partner_channel.send(1, 2, "hello", device_data, np.ones(self.msg_size, dtype='int8') ) @@ -47,8 +45,10 @@ def run(self, done_future, addr_optimization = False): assert np.array_equal(h_ary, host_data) if addr_optimization: - partner_channel.send(gpu_src_ptrs = d_addr, gpu_src_sizes = d_size) - 
partner_channel.recv(post_buf_addresses = d_addr, post_buf_sizes = d_size) + partner_channel.send(gpu_src_ptrs=d_addr, gpu_src_sizes=d_size) + partner_channel.recv(post_buf_addresses=d_addr, + post_buf_sizes=d_size + ) else: partner_channel.send(device_data) partner_channel.recv(device_data) @@ -56,9 +56,10 @@ def run(self, done_future, addr_optimization = False): h_ary = device_data.copy_to_host() assert np.array_equal(h_ary, host_data) else: - h_data = np.ones(self.msg_size, dtype='int8') if addr_optimization: - p_data = partner_channel.recv(post_buf_addresses = d_addr, post_buf_sizes = d_size) + p_data = partner_channel.recv(post_buf_addresses=d_addr, + post_buf_sizes=d_size + ) else: p_data = partner_channel.recv(device_data) p_data, p_host_arr = p_data[0:-1], p_data[-1] @@ -67,28 +68,35 @@ def run(self, done_future, addr_optimization = False): compare = np.zeros(self.msg_size, dtype='int8') compare.fill(5) assert np.array_equal(recvd, compare) - assert np.array_equal(np.ones(self.msg_size, dtype='int8'), p_host_arr) + assert np.array_equal(np.ones(self.msg_size, dtype='int8'), + p_host_arr + ) assert p_data == (1, 2, "hello") if addr_optimization: - partner_channel.send(2,3, gpu_src_ptrs = d_addr, gpu_src_sizes = d_size) + partner_channel.send(2, 3, gpu_src_ptrs=d_addr, + gpu_src_sizes=d_size + ) else: - partner_channel.send(2,3, device_data) + partner_channel.send(2, 3, device_data) if addr_optimization: - partner_channel.recv(post_buf_addresses = d_addr, post_buf_sizes = d_size) - partner_channel.send(gpu_src_ptrs = d_addr, gpu_src_sizes = d_size) + partner_channel.recv(post_buf_addresses=d_addr, + post_buf_sizes=d_size + ) + partner_channel.send(gpu_src_ptrs=d_addr, gpu_src_sizes=d_size) else: partner_channel.recv(device_data) partner_channel.send(device_data) - self.reduce(done_future) + class ArrMap(ArrayMap): def procNum(self, index): return index[0] % 2 + def main(args): # if this is not a cuda-aware build, # vacuously pass the test @@ -99,10 +107,15 @@ def main(args): charm.exit(0) peMap = Group(ArrMap) - chares = Array(A, 2, args=[8192], map = peMap) + chares = Array(A, 2, args=[8192], map=peMap) + done_fut = Future() + chares.run(done_fut, addr_optimization=False) + done_fut.get() + done_fut = Future() - chares.run(done_fut, addr_optimization = False) + chares.run(done_fut, addr_optimization=True) done_fut.get() charm.exit(0) + charm.start(main) From 4f84baaa66f6b97636694781247c55511b7523e0 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Thu, 22 Apr 2021 15:27:34 -0400 Subject: [PATCH 092/107] fixed hangup when multiple device arrays are sent --- charm4py/charm.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/charm4py/charm.py b/charm4py/charm.py index 71df707f..afc67a34 100644 --- a/charm4py/charm.py +++ b/charm4py/charm.py @@ -357,7 +357,8 @@ def unpackMsg(self, msg, dcopy_start, dest_obj): return header, args def getGPUDirectData(self, post_buffers, remote_bufs, stream_ptrs): - return_fut = self.Future(len(post_buffers)) + # this future will only be satisfied when all buffers have been received + return_fut = self.Future() post_buf_data = [getDeviceDataAddress(buf) for buf in post_buffers] post_buf_sizes = [getDeviceDataSizeInBytes(buf) for buf in post_buffers] if not stream_ptrs: @@ -366,7 +367,8 @@ def getGPUDirectData(self, post_buffers, remote_bufs, stream_ptrs): return return_fut def getGPUDirectDataFromAddresses(self, post_buf_ptrs, post_buf_sizes, remote_bufs, stream_ptrs): - return_fut = self.Future(len(post_buf_ptrs)) + # this future will 
only be satisfied when all buffers have been received + return_fut = self.Future() if not stream_ptrs: stream_ptrs = array.array('L', [0] * len(post_buf_ptrs)) self.lib.getGPUDirectDataFromAddresses(post_buf_ptrs, post_buf_sizes, remote_bufs, stream_ptrs, return_fut) From 49c82a19e3faaf35a70b140e600ba1a7a185b59b Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Thu, 22 Apr 2021 15:46:34 -0400 Subject: [PATCH 093/107] remove comment --- tests/cuda/single_array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/cuda/single_array.py b/tests/cuda/single_array.py index 526aaf92..de0606c3 100644 --- a/tests/cuda/single_array.py +++ b/tests/cuda/single_array.py @@ -14,7 +14,7 @@ def run(self, done_future, addr_optimization=False): partner_channel = Channel(self, partner) device_data = cuda.device_array(self.msg_size, dtype='int8') - # if addr_optimization: + d_addr = array.array('L', [0]) d_size = array.array('L', [0]) From f9ffaf665b60ed0185e147e3be9fc90f9de7e129 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Thu, 22 Apr 2021 15:46:53 -0400 Subject: [PATCH 094/107] change != None to is not None --- charm4py/channel.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/charm4py/channel.py b/charm4py/channel.py index 22e29efb..15a407d1 100644 --- a/charm4py/channel.py +++ b/charm4py/channel.py @@ -79,10 +79,11 @@ def recv(self, *post_buffers, post_buf_addresses = None, post_buf_sizes = None, ret = ret[0] else: gpu_recv_bufs = ret + assert len(post_buffers) == len(gpu_recv_bufs) recv_future = charm.getGPUDirectData(post_buffers, gpu_recv_bufs, stream_ptrs) recv_future.get() - elif post_buf_addresses != None: + elif post_buf_addresses is not None: if isinstance(ret, tuple): gpu_recv_bufs = ret[-1] ret = ret[0:-1] From 46851e68a96dfa69e95cfc354bc4f39a6979790b Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Thu, 22 Apr 2021 15:47:35 -0400 Subject: [PATCH 095/107] tests for multiple arrays --- tests/cuda/multi_array.py | 85 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100644 tests/cuda/multi_array.py diff --git a/tests/cuda/multi_array.py b/tests/cuda/multi_array.py new file mode 100644 index 00000000..4e338b4e --- /dev/null +++ b/tests/cuda/multi_array.py @@ -0,0 +1,85 @@ +from charm4py import charm, Chare, Array, coro, Future, Channel, Group, ArrayMap +import numpy as np +from numba import cuda +import array + + +class A(Chare): + def __init__(self, msg_size): + self.msg_size = msg_size + + @coro + def run(self, done_future, addr_optimization=False): + partner = self.thisProxy[int(not self.thisIndex[0])] + partner_channel = Channel(self, partner) + + device_data = cuda.device_array(self.msg_size, dtype='int8') + device_data2 = cuda.device_array(self.msg_size, dtype='int8') + # if addr_optimization: + d_addr = array.array('L', [0, 0]) + d_size = array.array('L', [0, 0]) + + d_addr[0] = device_data.__cuda_array_interface__['data'][0] + d_addr[1] = device_data2.__cuda_array_interface__['data'][0] + + d_size[0] = device_data.nbytes + d_size[1] = device_data2.nbytes + + host_array = np.array(self.msg_size, dtype='int32') + host_array.fill(42) + + if self.thisIndex[0]: + h1 = np.ones(self.msg_size, dtype='int8') + h2 = np.zeros(self.msg_size, dtype='int8') + device_data.copy_to_device(h1) + device_data2.copy_to_device(h2) + if addr_optimization: + partner_channel.send(20, host_array, gpu_src_ptrs=d_addr, + gpu_src_sizes=d_size + ) + else: + partner_channel.send(20, host_array, device_data, device_data2) + else: + if 
addr_optimization: + f, g = partner_channel.recv(device_data, device_data2) + else: + f, g = partner_channel.recv(post_buf_addresses=d_addr, + post_buf_sizes=d_addr + ) + h1 = device_data.copy_to_host() + h2 = device_data2.copy_to_host() + + assert f == 20 + assert np.array_equal(host_array, g) + assert np.array_equal(h1, np.ones(self.msg_size, dtype='int8')) + assert np.array_equal(h2, np.zeros(self.msg_size, dtype='int8')) + self.reduce(done_future) + + +class ArrMap(ArrayMap): + def procNum(self, index): + return index[0] % 2 + + +def main(args): + # if this is not a cuda-aware build, + # vacuously pass the test + if not charm.CkCudaEnabled(): + print("WARNING: Charm4Py was not build with CUDA-enabled Charm++. " + "GPU-Direct functionality will not be tested" + ) + charm.exit(0) + + peMap = Group(ArrMap) + chares = Array(A, 2, args=[8192], map=peMap) + done_fut = Future() + chares.run(done_fut, addr_optimization=False) + done_fut.get() + + done_fut = Future() + chares.run(done_fut, addr_optimization=True) + done_fut.get() + charm.exit(0) + + +charm.start(main) From e730942b0d0fea4dfd984980ad9776b567d51fb9 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Sat, 24 Apr 2021 15:08:44 -0400 Subject: [PATCH 096/107] sizes should be int to match charm++ side --- charm4py/charmlib/ccharm.pxd | 4 +-- charm4py/charmlib/charmlib_cython.pyx | 46 +++++++++++++-------------- tests/cuda/multi_array.py | 12 ++++--- tests/cuda/single_array.py | 4 +-- 4 files changed, 34 insertions(+), 32 deletions(-) diff --git a/charm4py/charmlib/ccharm.pxd b/charm4py/charmlib/ccharm.pxd index 12db2492..3d7e40d5 100644 --- a/charm4py/charmlib/ccharm.pxd +++ b/charm4py/charmlib/ccharm.pxd @@ -79,12 +79,12 @@ cdef extern from "charm.h": int epIdx, int num_bufs, char **bufs, int *buf_sizes, long *devBufPtrs, - long *devBufSizesInBytes, + int *devBufSizesInBytes, long *streamPtrs, int numDevBufs ); - void registerArrayMsgGPUDirectRecvExtCallback(void (*cb)(int, int, int*, int, int, long*, void *, int, char*, int)); + void registerArrayMsgGPUDirectRecvExtCallback(void (*cb)(int, int, int*, int, int, int*, void *, int, char*, int)); void CkGetGPUDirectData(int numBuffers, void *recvBufPtrs, int *arrSizes, void *remoteBufInfo, void *streamPtrs, int futureId); diff --git a/charm4py/charmlib/charmlib_cython.pyx b/charm4py/charmlib/charmlib_cython.pyx index 58951fb1..b54044ff 100644 --- a/charm4py/charmlib/charmlib_cython.pyx +++ b/charm4py/charmlib/charmlib_cython.pyx @@ -309,7 +309,7 @@ cdef c_type_table_typecodes = [None] * 13 cdef int c_type_table_sizes[13] cdef int[SECTION_MAX_BFACTOR] section_children cdef long[NUM_DCOPY_BUFS] gpu_direct_device_ptrs -cdef long[NUM_DCOPY_BUFS] gpu_direct_buff_sizes +cdef int[NUM_DCOPY_BUFS] gpu_direct_buff_sizes cdef long[NUM_DCOPY_BUFS] gpu_direct_stream_ptrs cdef object charm @@ -485,16 +485,16 @@ class CharmLib(object): else: memset(gpu_direct_stream_ptrs, 0, sizeof(long) * num_direct_buffers) - send_bufs[0] = msg0 - send_buf_sizes[0] = len(msg0) - CkChareExtSendWithDeviceData(array_id, c_index, ndims, ep, - cur_buf, send_bufs, send_buf_sizes, - gpu_direct_device_ptrs, - gpu_direct_buff_sizes, - gpu_direct_stream_ptrs, - num_direct_buffers - ) - cur_buf = 1 + send_bufs[0] = msg0 + send_buf_sizes[0] = len(msg0) + CkChareExtSendWithDeviceData(array_id, c_index, ndims, ep, + cur_buf, send_bufs, send_buf_sizes, + gpu_direct_device_ptrs, + gpu_direct_buff_sizes, + gpu_direct_stream_ptrs, + num_direct_buffers + ) + cur_buf = 1 gpu_direct_buf_idx = 0 def CkArraySendWithDeviceDataFromPointers(self, 
int array_id, index not None, int ep, @@ -515,16 +515,16 @@ class CharmLib(object): else: memset(gpu_direct_stream_ptrs, 0, sizeof(long) * num_bufs) - send_bufs[0] = msg0 - send_buf_sizes[0] = len(msg0) - CkChareExtSendWithDeviceData(array_id, c_index, ndims, ep, - cur_buf, send_bufs, send_buf_sizes, - gpu_src_ptrs.data.as_voidptr, - gpu_src_sizes.data.as_voidptr, - gpu_direct_stream_ptrs, - num_bufs - ) - cur_buf = 1 + send_bufs[0] = msg0 + send_buf_sizes[0] = len(msg0) + CkChareExtSendWithDeviceData(array_id, c_index, ndims, ep, + cur_buf, send_bufs, send_buf_sizes, + gpu_src_ptrs.data.as_voidptr, + gpu_src_sizes.data.as_voidptr, + gpu_direct_stream_ptrs, + num_bufs + ) + cur_buf = 1 gpu_direct_buf_idx = 0 def CkCudaEnabled(self): @@ -933,8 +933,8 @@ class CharmLib(object): def getGPUDirectData(self, list post_buf_data, list post_buf_sizes, array.array remote_bufs, list stream_ptrs, return_fut): cdef int num_buffers = len(post_buf_data) cdef int future_id = return_fut.fid - cdef array.array int_array_template = array.array('i', []) cdef array.array long_array_template = array.array('L', []) + cdef array.array int_array_template = array.array('i', []) cdef array.array recv_buf_sizes cdef array.array recv_buf_ptrs # pointers from the remote that we will be issuing Rgets for @@ -1034,7 +1034,7 @@ cdef void recvArrayMsg(int aid, int ndims, int *arrayIndex, int ep, int msgSize, charm.handleGeneralError() cdef void recvGPUDirectMsg(int aid, int ndims, int *arrayIndex, int ep, int numDevBuffs, - long *devBufSizes, void *devBufs, int msgSize, + int *devBufSizes, void *devBufs, int msgSize, char *msg, int dcopy_start): cdef int idx = 0 diff --git a/tests/cuda/multi_array.py b/tests/cuda/multi_array.py index 4e338b4e..4dc26c74 100644 --- a/tests/cuda/multi_array.py +++ b/tests/cuda/multi_array.py @@ -17,7 +17,7 @@ def run(self, done_future, addr_optimization=False): device_data2 = cuda.device_array(self.msg_size, dtype='int8') # if addr_optimization: d_addr = array.array('L', [0, 0]) - d_size = array.array('L', [0, 0]) + d_size = array.array('i', [0, 0]) d_addr[0] = device_data.__cuda_array_interface__['data'][0] d_addr[1] = device_data2.__cuda_array_interface__['data'][0] @@ -37,15 +37,17 @@ def run(self, done_future, addr_optimization=False): partner_channel.send(20, host_array, gpu_src_ptrs=d_addr, gpu_src_sizes=d_size ) + partner_channel.recv() else: partner_channel.send(20, host_array, device_data, device_data2) else: if addr_optimization: - f, g = partner_channel.recv(device_data, device_data2) - else: f, g = partner_channel.recv(post_buf_addresses=d_addr, - post_buf_sizes=d_addr + post_buf_sizes=d_size ) + else: + f, g = partner_channel.recv(device_data, device_data2) + partner_channel.send(1) h1 = device_data.copy_to_host() h2 = device_data2.copy_to_host() @@ -71,7 +73,7 @@ def main(args): charm.exit(0) peMap = Group(ArrMap) - chares = Array(A, 2, args=[8192], map=peMap) + chares = Array(A, 2, args=[(1<<30)], map=peMap) done_fut = Future() chares.run(done_fut, addr_optimization=False) done_fut.get() diff --git a/tests/cuda/single_array.py b/tests/cuda/single_array.py index de0606c3..323daa88 100644 --- a/tests/cuda/single_array.py +++ b/tests/cuda/single_array.py @@ -16,7 +16,7 @@ def run(self, done_future, addr_optimization=False): device_data = cuda.device_array(self.msg_size, dtype='int8') d_addr = array.array('L', [0]) - d_size = array.array('L', [0]) + d_size = array.array('i', [0]) d_addr[0] = device_data.__cuda_array_interface__['data'][0] d_size[0] = device_data.nbytes @@ -107,7 
+107,7 @@ def main(args): charm.exit(0) peMap = Group(ArrMap) - chares = Array(A, 2, args=[8192], map=peMap) + chares = Array(A, 2, args=[1<<20], map=peMap) done_fut = Future() chares.run(done_fut, addr_optimization=False) done_fut.get() From 491de28acf898402fcb4de7875efba993f422032 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Mon, 26 Apr 2021 11:32:20 -0400 Subject: [PATCH 097/107] streams supported at charm4py layer --- charm4py/charmlib/charmlib_cython.pyx | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/charm4py/charmlib/charmlib_cython.pyx b/charm4py/charmlib/charmlib_cython.pyx index b54044ff..8acdbd53 100644 --- a/charm4py/charmlib/charmlib_cython.pyx +++ b/charm4py/charmlib/charmlib_cython.pyx @@ -463,7 +463,7 @@ class CharmLib(object): cur_buf = 1 def CkArraySendWithDeviceData(self, int array_id, index not None, int ep, - msg not None, list stream_ptrs): + msg not None, stream_ptrs): global gpu_direct_buf_idx cdef int i = 0 @@ -479,10 +479,10 @@ class CharmLib(object): global gpu_direct_stream_ptrs global cur_buf - if stream_ptrs: + if stream_ptrs and isinstance(stream_ptrs, list): for i in range(num_direct_buffers): gpu_direct_stream_ptrs[i] = stream_ptrs[i] - else: + elif not stream_ptrs: memset(gpu_direct_stream_ptrs, 0, sizeof(long) * num_direct_buffers) send_bufs[0] = msg0 @@ -500,7 +500,7 @@ class CharmLib(object): def CkArraySendWithDeviceDataFromPointers(self, int array_id, index not None, int ep, msg not None, array.array gpu_src_ptrs, array.array gpu_src_sizes, - list stream_ptrs, int num_bufs): + stream_ptrs, int num_bufs): cdef int i = 0 cdef int ndims = len(index) @@ -509,7 +509,7 @@ class CharmLib(object): msg0, dcopy = msg dcopy = None - if stream_ptrs: + if stream_ptrs and isinstance(stream_ptrs, list): for i in range(num_bufs): gpu_direct_stream_ptrs[i] = stream_ptrs[i] else: From fbcef04d26578fc530fc3b41f6137ad6a35895c9 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Mon, 26 Apr 2021 11:32:31 -0400 Subject: [PATCH 098/107] use streams when provided --- tests/cuda/single_array.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tests/cuda/single_array.py b/tests/cuda/single_array.py index 323daa88..e53276f0 100644 --- a/tests/cuda/single_array.py +++ b/tests/cuda/single_array.py @@ -21,6 +21,9 @@ def run(self, done_future, addr_optimization=False): d_addr[0] = device_data.__cuda_array_interface__['data'][0] d_size[0] = device_data.nbytes + my_stream = cuda.stream() + stream_addr = array.array('L', [my_stream.handle.value]) + if self.thisIndex[0]: host_data = np.zeros(self.msg_size, dtype='int8') host_data.fill(5) @@ -28,15 +31,18 @@ def run(self, done_future, addr_optimization=False): if addr_optimization: partner_channel.send(1, 2, "hello", np.ones(self.msg_size, dtype='int8'), - gpu_src_ptrs=d_addr, gpu_src_sizes=d_size + gpu_src_ptrs=d_addr, gpu_src_sizes=d_size, + stream_ptrs=stream_addr ) p_data = partner_channel.recv(post_buf_addresses=d_addr, - post_buf_sizes=d_size + post_buf_sizes=d_size, + stream_ptrs=stream_addr ) else: partner_channel.send(1, 2, "hello", device_data, - np.ones(self.msg_size, dtype='int8') + np.ones(self.msg_size, dtype='int8'), + stream_ptrs=stream_addr ) p_data = partner_channel.recv(device_data) From f7d9083f75c079d156e8d66abddd2c9ad80c0698 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Mon, 26 Apr 2021 12:44:43 -0400 Subject: [PATCH 099/107] device send function array-specific --- charm4py/charmlib/ccharm.pxd | 2 +- charm4py/charmlib/charmlib_cython.pyx | 13 +++++++------ 2 
files changed, 8 insertions(+), 7 deletions(-) diff --git a/charm4py/charmlib/ccharm.pxd b/charm4py/charmlib/ccharm.pxd index 3d7e40d5..902d2127 100644 --- a/charm4py/charmlib/ccharm.pxd +++ b/charm4py/charmlib/ccharm.pxd @@ -75,7 +75,7 @@ cdef extern from "charm.h": # TODO: Organize these to place them near their related functions int CkCudaEnabled(); int CUDAPointerOnDevice(const void *ptr); - void CkChareExtSendWithDeviceData(int aid, int *idx, int ndims, + void CkArrayExtSendWithDeviceData(int aid, int *idx, int ndims, int epIdx, int num_bufs, char **bufs, int *buf_sizes, long *devBufPtrs, diff --git a/charm4py/charmlib/charmlib_cython.pyx b/charm4py/charmlib/charmlib_cython.pyx index 8acdbd53..a07a14b2 100644 --- a/charm4py/charmlib/charmlib_cython.pyx +++ b/charm4py/charmlib/charmlib_cython.pyx @@ -487,7 +487,7 @@ class CharmLib(object): send_bufs[0] = msg0 send_buf_sizes[0] = len(msg0) - CkChareExtSendWithDeviceData(array_id, c_index, ndims, ep, + CkArrayExtSendWithDeviceData(array_id, c_index, ndims, ep, cur_buf, send_bufs, send_buf_sizes, gpu_direct_device_ptrs, gpu_direct_buff_sizes, @@ -517,7 +517,7 @@ class CharmLib(object): send_bufs[0] = msg0 send_buf_sizes[0] = len(msg0) - CkChareExtSendWithDeviceData(array_id, c_index, ndims, ep, + CkArrayExtSendWithDeviceData(array_id, c_index, ndims, ep, cur_buf, send_bufs, send_buf_sizes, gpu_src_ptrs.data.as_voidptr, gpu_src_sizes.data.as_voidptr, @@ -787,7 +787,7 @@ class CharmLib(object): registerChareMsgRecvExtCallback(recvChareMsg) registerGroupMsgRecvExtCallback(recvGroupMsg) registerArrayMsgRecvExtCallback(recvArrayMsg) - registerArrayMsgGPUDirectRecvExtCallback(recvGPUDirectMsg) + registerArrayMsgGPUDirectRecvExtCallback(recvGPUDirectArrayMsg) registerArrayBcastRecvExtCallback(recvArrayBcast) registerArrayMapProcNumExtCallback(arrayMapProcNum) registerArrayElemJoinExtCallback(arrayElemJoin) @@ -1033,9 +1033,9 @@ cdef void recvArrayMsg(int aid, int ndims, int *arrayIndex, int ep, int msgSize, except: charm.handleGeneralError() -cdef void recvGPUDirectMsg(int aid, int ndims, int *arrayIndex, int ep, int numDevBuffs, - int *devBufSizes, void *devBufs, int msgSize, - char *msg, int dcopy_start): +cdef void recvGPUDirectArrayMsg(int aid, int ndims, int *arrayIndex, int ep, int numDevBuffs, + int *devBufSizes, void *devBufs, int msgSize, + char *msg, int dcopy_start): cdef int idx = 0 try: @@ -1047,6 +1047,7 @@ cdef void recvGPUDirectMsg(int aid, int ndims, int *arrayIndex, int ep, int numD # Add the buffer's address to the list devBufInfo[idx] = devBufs+(CK_DEVICEBUFFER_SIZE_IN_BYTES*idx) recv_buffer.setMsg(msg, msgSize) + # TODO: Can this be the same for array and groups? 
charm.recvGPUDirectMsg(aid, array_index_to_tuple(ndims, arrayIndex), ep, devBufInfo, recv_buffer, dcopy_start) except: From aae2ff16ce562a85ce02af2fc8ab50bf6a914bb6 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Tue, 18 May 2021 11:26:08 -0400 Subject: [PATCH 100/107] let CkArraySendWithDeviceDataFromPointers determine number of buffers --- charm4py/chare.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/charm4py/chare.py b/charm4py/chare.py index 35d53bd8..49f15f22 100644 --- a/charm4py/chare.py +++ b/charm4py/chare.py @@ -768,9 +768,10 @@ def proxy_entry_method(proxy, *args, **kwargs): charm.CkArraySendWithDeviceDataFromPointers(aid, elemIdx, ep, msg, kwargs['gpu_src_ptrs'], kwargs['gpu_src_sizes'], - stream_ptrs, - len(kwargs['gpu_src_ptrs']) - ) + stream_ptrs + ) + + else: charm.CkArraySend(aid, elemIdx, ep, msg) From abf27e35febf859ae118a119bc85b3df308361d8 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Tue, 18 May 2021 11:27:04 -0400 Subject: [PATCH 101/107] add general slower case for getting gpu pointers/sizes from memoryviews --- charm4py/charm.py | 25 ++++++++++++++++- charm4py/charmlib/charmlib_cython.pyx | 40 ++++++++++++++++++++++++--- 2 files changed, 60 insertions(+), 5 deletions(-) diff --git a/charm4py/charm.py b/charm4py/charm.py index afc67a34..64085ef3 100644 --- a/charm4py/charm.py +++ b/charm4py/charm.py @@ -133,7 +133,8 @@ def __init__(self): self.CkGroupSend = self.lib.CkGroupSend self.CkArraySend = self.lib.CkArraySend self.CkArraySendWithDeviceData = self.lib.CkArraySendWithDeviceData - self.CkArraySendWithDeviceDataFromPointers = self.lib.CkArraySendWithDeviceDataFromPointers + self.CkArraySendWithDeviceDataFromPointersArray = self.lib.CkArraySendWithDeviceDataFromPointersArray + self.CkArraySendWithDeviceDataFromPointersOther = self.lib.CkArraySendWithDeviceDataFromPointersOther self.CkCudaEnabled = self.lib.CkCudaEnabled self.reducers = reduction.ReducerContainer(self) self.redMgr = reduction.ReductionManager(self, self.reducers) @@ -374,6 +375,28 @@ def getGPUDirectDataFromAddresses(self, post_buf_ptrs, post_buf_sizes, remote_bu self.lib.getGPUDirectDataFromAddresses(post_buf_ptrs, post_buf_sizes, remote_bufs, stream_ptrs, return_fut) return return_fut + def CkArraySendWithDeviceDataFromPointers(self, array_id, index, ep, + msg, gpu_src_ptrs, + gpu_src_sizes, + stream_ptrs + ): + if isinstance(gpu_src_ptrs, array.array): + assert isinstance(gpu_src_sizes, array.array), \ + "GPU source pointers and sizes must be of the same type." 
+ self.CkArraySendWithDeviceDataFromPointersArray(array_id, index, ep, + msg, gpu_src_ptrs, + gpu_src_sizes, + stream_ptrs, + len(gpu_src_ptrs) + ) + else: + self.CkArraySendWithDeviceDataFromPointersOther(array_id, index, ep, + msg, gpu_src_ptrs, + gpu_src_sizes, + stream_ptrs, + len(gpu_src_ptrs) + ) + # deposit value of one of the futures that was created on this PE def _future_deposit_result(self, fid, result=None): self.threadMgr.depositFuture(fid, result) diff --git a/charm4py/charmlib/charmlib_cython.pyx b/charm4py/charmlib/charmlib_cython.pyx index a07a14b2..1f90703d 100644 --- a/charm4py/charmlib/charmlib_cython.pyx +++ b/charm4py/charmlib/charmlib_cython.pyx @@ -497,10 +497,10 @@ class CharmLib(object): cur_buf = 1 gpu_direct_buf_idx = 0 - def CkArraySendWithDeviceDataFromPointers(self, int array_id, index not None, int ep, - msg not None, array.array gpu_src_ptrs, - array.array gpu_src_sizes, - stream_ptrs, int num_bufs): + def CkArraySendWithDeviceDataFromPointersArray(self, int array_id, index not None, int ep, + msg not None, array.array gpu_src_ptrs, + array.array gpu_src_sizes, + stream_ptrs, int num_bufs): cdef int i = 0 cdef int ndims = len(index) @@ -527,6 +527,38 @@ class CharmLib(object): cur_buf = 1 gpu_direct_buf_idx = 0 + def CkArraySendWithDeviceDataFromPointersOther(self, int array_id, index not None, int ep, + msg not None, gpu_src_ptrs, + gpu_src_sizes, + stream_ptrs, int num_bufs + ): + cdef int i = 0 + cdef int ndims = len(index) + global cur_buf + for i in range(ndims): c_index[i] = index[i] + msg0, dcopy = msg + dcopy = None + cdef unsigned long[:] gpu_addresses = gpu_src_ptrs + cdef int[:] gpu_buffer_sizes = gpu_src_sizes + + if stream_ptrs and isinstance(stream_ptrs, list): + for i in range(num_bufs): + gpu_direct_stream_ptrs[i] = stream_ptrs[i] + else: + memset(gpu_direct_stream_ptrs, 0, sizeof(long) * num_bufs) + + send_bufs[0] = msg0 + send_buf_sizes[0] = len(msg0) + CkArrayExtSendWithDeviceData(array_id, c_index, ndims, ep, + cur_buf, send_bufs, send_buf_sizes, + &gpu_addresses[0], + &gpu_buffer_sizes[0], + gpu_direct_stream_ptrs, + num_bufs + ) + cur_buf = 1 + gpu_direct_buf_idx = 0 + def CkCudaEnabled(self): return bool(CkCudaEnabled()) From 8188d2955f93505817cc7f3162bb6cdc608b7a35 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 19 May 2021 15:13:49 -0400 Subject: [PATCH 102/107] use Charm++ functionality for GPU-direct group support --- charm4py/charmlib/ccharm.pxd | 6 ++ charm4py/charmlib/charmlib_cython.pyx | 108 +++++++++++++++++++++++++- 2 files changed, 113 insertions(+), 1 deletion(-) diff --git a/charm4py/charmlib/ccharm.pxd b/charm4py/charmlib/ccharm.pxd index 902d2127..2de91517 100644 --- a/charm4py/charmlib/ccharm.pxd +++ b/charm4py/charmlib/ccharm.pxd @@ -53,6 +53,7 @@ cdef extern from "charm.h": void registerReadOnlyRecvExtCallback(void (*cb)(int, char*)); void registerChareMsgRecvExtCallback(void (*cb)(int, void*, int, int, char*, int)); void registerGroupMsgRecvExtCallback(void (*cb)(int, int, int, char *, int)); + void registerGroupMsgGPUDirectRecvExtCallback(void (*cb)(int, int, int, int *, void *, int, char *, int)); void registerArrayMsgRecvExtCallback(void (*cb)(int, int, int *, int, int, char *, int)); void registerArrayBcastRecvExtCallback(void (*cb)(int, int, int, int, int *, int, int, char *, int)); void registerArrayElemLeaveExtCallback(int (*cb)(int, int, int *, char**, int)); @@ -82,6 +83,11 @@ cdef extern from "charm.h": int *devBufSizesInBytes, long *streamPtrs, int numDevBufs ); + void 
CkGroupExtSendWithDeviceData(int gid, int pe, int epIdx, int num_bufs, char **bufs, + int *buf_sizes, long *devBufPtrs, + int *devBufSizesInBytes, + long *streamPtrs, int numDevBufs + ); void registerArrayMsgGPUDirectRecvExtCallback(void (*cb)(int, int, int*, int, int, int*, void *, int, char*, int)); diff --git a/charm4py/charmlib/charmlib_cython.pyx b/charm4py/charmlib/charmlib_cython.pyx index 1f90703d..0183ea10 100644 --- a/charm4py/charmlib/charmlib_cython.pyx +++ b/charm4py/charmlib/charmlib_cython.pyx @@ -462,6 +462,93 @@ class CharmLib(object): CkGroupExtSend_multi(group_id, num_pes, section_children, ep, cur_buf, send_bufs, send_buf_sizes) cur_buf = 1 + def CkGroupSendWithDeviceData(self, int group_id, int index, int ep, + msg not None, stream_ptrs): + global gpu_direct_buf_idx + cdef int i = 0 + msg0, dcopy = msg + dcopy = None + cdef int num_direct_buffers = gpu_direct_buf_idx + # TODO: Message on assertion failure + assert num_direct_buffers <= NUM_DCOPY_BUFS + global gpu_direct_device_ptrs + global gpu_direct_stream_ptrs + global cur_buf + + if stream_ptrs and isinstance(stream_ptrs, list): + for i in range(num_direct_buffers): + gpu_direct_stream_ptrs[i] = stream_ptrs[i] + elif not stream_ptrs: + memset(gpu_direct_stream_ptrs, 0, sizeof(long) * num_direct_buffers) + + send_bufs[0] = msg0 + send_buf_sizes[0] = len(msg0) + CkGroupExtSendWithDeviceData(group_id, index, ep, + cur_buf, send_bufs, send_buf_sizes, + gpu_direct_device_ptrs, + gpu_direct_buff_sizes, + gpu_direct_stream_ptrs, + num_direct_buffers + ) + cur_buf = 1 + gpu_direct_buf_idx = 0 + + def CkGroupSendWithDeviceDataFromPointersArray(self, int gid, int index, int ep, + msg not None, array.array gpu_src_ptrs, + array.array gpu_src_sizes, stream_ptrs, + num_bufs): + cdef int i = 0 + global cur_buf + msg0, dcopy = msg + dcopy = None + + if stream_ptrs and isinstance(stream_ptrs, list): + for i in range(num_bufs): + gpu_direct_stream_ptrs[i] = stream_ptrs[i] + else: + memset(gpu_direct_stream_ptrs, 0, sizeof(long) * num_bufs) + + send_bufs[0] = msg0 + send_buf_sizes[0] = len(msg0) + CkGroupExtSendWithDeviceData(gid, index, ep, + cur_buf, send_bufs, send_buf_sizes, + gpu_src_ptrs.data.as_voidptr, + gpu_src_sizes.data.as_voidptr, + gpu_direct_stream_ptrs, + num_bufs + ) + cur_buf = 1 + gpu_direct_buf_idx = 0 + + def CkGroupSendWithDeviceDataFromPointersOther(self, int gid, int index, int ep, + msg not None, gpu_src_ptrs, + gpu_src_sizes, stream_ptrs, + num_bufs): + cdef int i = 0 + global cur_buf + msg0, dcopy = msg + dcopy = None + cdef unsigned long[:] gpu_addresses = gpu_src_ptrs + cdef int[:] gpu_buffer_sizes = gpu_src_sizes + + if stream_ptrs and isinstance(stream_ptrs, list): + for i in range(num_bufs): + gpu_direct_stream_ptrs[i] = stream_ptrs[i] + else: + memset(gpu_direct_stream_ptrs, 0, sizeof(long) * num_bufs) + + send_bufs[0] = msg0 + send_buf_sizes[0] = len(msg0) + CkGroupExtSendWithDeviceData(gid, index, ep, + cur_buf, send_bufs, send_buf_sizes, + &gpu_addresses[0], + &gpu_buffer_sizes[0], + gpu_direct_stream_ptrs, + num_bufs + ) + cur_buf = 1 + gpu_direct_buf_idx = 0 + def CkArraySendWithDeviceData(self, int array_id, index not None, int ep, msg not None, stream_ptrs): @@ -818,6 +905,7 @@ class CharmLib(object): registerReadOnlyRecvExtCallback(recvReadOnly) registerChareMsgRecvExtCallback(recvChareMsg) registerGroupMsgRecvExtCallback(recvGroupMsg) + registerGroupMsgGPUDirectRecvExtCallback(recvGPUDirectGroupMsg) registerArrayMsgRecvExtCallback(recvArrayMsg) 
registerArrayMsgGPUDirectRecvExtCallback(recvGPUDirectArrayMsg) registerArrayBcastRecvExtCallback(recvArrayBcast) @@ -1055,6 +1143,24 @@ cdef void recvGroupMsg(int gid, int ep, int msgSize, char *msg, int dcopy_start) except: charm.handleGeneralError() +cdef void recvGPUDirectGroupMsg(int gid, int ep, int numDevBuffs, + int *devBufSizes, void *devBufs, int msgSize, + char *msg, int dcopy_start + ): + try: + if PROFILING: + charm._precvtime = time.time() + charm.recordReceive(msgSize) + devBufInfo = array.array('L', [0] * numDevBuffs) + for idx in range(numDevBuffs): + # Add the buffer's address to the list + devBufInfo[idx] = devBufs+(CK_DEVICEBUFFER_SIZE_IN_BYTES*idx) + recv_buffer.setMsg(msg, msgSize) + charm.recvGPUDirectGroupMsg(gid, ep, devBufInfo, recv_buffer, dcopy_start) + except: + charm.handleGeneralError() + + cdef void recvArrayMsg(int aid, int ndims, int *arrayIndex, int ep, int msgSize, char *msg, int dcopy_start): try: if PROFILING: @@ -1080,7 +1186,7 @@ cdef void recvGPUDirectArrayMsg(int aid, int ndims, int *arrayIndex, int ep, int devBufInfo[idx] = devBufs+(CK_DEVICEBUFFER_SIZE_IN_BYTES*idx) recv_buffer.setMsg(msg, msgSize) # TODO: Can this be the same for array and groups? - charm.recvGPUDirectMsg(aid, array_index_to_tuple(ndims, arrayIndex), ep, devBufInfo, recv_buffer, dcopy_start) + charm.recvGPUDirectArrayMsg(aid, array_index_to_tuple(ndims, arrayIndex), ep, devBufInfo, recv_buffer, dcopy_start) except: charm.handleGeneralError() From 895e619f05dba50641741cc3e6b1962357f4eff0 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 19 May 2021 15:14:21 -0400 Subject: [PATCH 103/107] Groups can now use GPU-direct functionality --- charm4py/chare.py | 27 +++++++++++++++++++++++++-- charm4py/charm.py | 35 ++++++++++++++++++++++++++++++++--- 2 files changed, 57 insertions(+), 5 deletions(-) diff --git a/charm4py/chare.py b/charm4py/chare.py index 49f15f22..74aff874 100644 --- a/charm4py/chare.py +++ b/charm4py/chare.py @@ -461,6 +461,9 @@ def proxy_entry_method(proxy, *args, **kwargs): for i in range(num_args, argcount): argname = argnames[i] # first look for argument in kwargs + # TODO: Should stream_ptrs be skipped? 
+ if argname in {'stream_ptrs', 'gpu_src_ptrs', 'gpu_src_sizes'}: + continue if argname in kwargs: args.append(kwargs[argname]) else: @@ -485,8 +488,28 @@ def proxy_entry_method(proxy, *args, **kwargs): gid = proxy.gid if Options.local_msg_optim and (elemIdx == charm._myPe) and (len(args) > 0): destObj = charm.groups[gid] - msg = charm.packMsg(destObj, args, header) - charm.CkGroupSend(gid, elemIdx, ep, msg) + should_pack_gpu = True + if 'gpu_src_ptrs' in kwargs: + should_pack_gpu = False + msg = charm.packMsg(destObj, args, header, pack_gpu=should_pack_gpu) + if msg[1] or not should_pack_gpu: + if 'stream_ptrs' in kwargs: + stream_ptrs = kwargs['stream_ptrs'] + else: + stream_ptrs = None + if should_pack_gpu: + charm.CkGroupSendWithDeviceData(gid, elemIdx, ep, + msg, stream_ptrs + ) + else: + charm.CkGroupSendWithDeviceDataFromPointers(gid, elemIdx, ep, + msg, kwargs['gpu_src_ptrs'], + kwargs['gpu_src_sizes'], + stream_ptrs + ) + + else: + charm.CkGroupSend(gid, elemIdx, ep, msg) else: root, sid = proxy.section header[b'sid'] = sid diff --git a/charm4py/charm.py b/charm4py/charm.py index 64085ef3..4e626a9e 100644 --- a/charm4py/charm.py +++ b/charm4py/charm.py @@ -133,8 +133,11 @@ def __init__(self): self.CkGroupSend = self.lib.CkGroupSend self.CkArraySend = self.lib.CkArraySend self.CkArraySendWithDeviceData = self.lib.CkArraySendWithDeviceData + self.CkGroupSendWithDeviceData = self.lib.CkGroupSendWithDeviceData self.CkArraySendWithDeviceDataFromPointersArray = self.lib.CkArraySendWithDeviceDataFromPointersArray self.CkArraySendWithDeviceDataFromPointersOther = self.lib.CkArraySendWithDeviceDataFromPointersOther + self.CkGroupSendWithDeviceDataFromPointersArray = self.lib.CkGroupSendWithDeviceDataFromPointersArray + self.CkGroupSendWithDeviceDataFromPointersOther = self.lib.CkGroupSendWithDeviceDataFromPointersOther self.CkCudaEnabled = self.lib.CkCudaEnabled self.reducers = reduction.ReducerContainer(self) self.redMgr = reduction.ReductionManager(self, self.reducers) @@ -321,15 +324,21 @@ def recvArrayMsg(self, aid, index, ep, msg, dcopy_start): self.arrays[aid][index] = obj em.run(obj, header, args) # now call the user's array element __init__ - def recvGPUDirectMsg(self, aid, index, ep, - devBuf_ptrs, msg, dcopy_start - ): + def recvGPUDirectArrayMsg(self, aid, index, ep, + devBuf_ptrs, msg, dcopy_start + ): obj = self.arrays[aid][index] header, args = self.unpackMsg(msg, dcopy_start, obj) args.append(devBuf_ptrs) self.invokeEntryMethod(obj, ep, header, args) + def recvGPUDirectGroupMsg(self, gid, ep, devBuf_ptrs, msg, dcopy_start): + obj = self.groups[gid] + header, args = self.unpackMsg(msg, dcopy_start, obj) + args.append(devBuf_ptrs) + self.invokeEntryMethod(obj, ep, header, args) + def recvArrayBcast(self, aid, indexes, ep, msg, dcopy_start): header, args = self.unpackMsg(msg, dcopy_start, None) array = self.arrays[aid] @@ -397,6 +406,26 @@ def CkArraySendWithDeviceDataFromPointers(self, array_id, index, ep, len(gpu_src_ptrs) ) + def CkGroupSendWithDeviceDataFromPointers(self, gid, elemIdx, ep, + msg, gpu_src_ptrs, gpu_src_sizes, + stream_ptrs): + if isinstance(gpu_src_ptrs, array.array): + assert isinstance(gpu_src_sizes, array.array), \ + "GPU source pointers and sizes must be of the same type." 
+ self.CkGroupSendWithDeviceDataFromPointersArray(gid, elemIdx, ep, msg, + gpu_src_ptrs, + gpu_src_sizes, + stream_ptrs, + len(gpu_src_ptrs) + ) + else: + self.CkGroupSendWithDeviceDataFromPointersOther(gid, elemIdx, ep, msg, + gpu_src_ptrs, + gpu_src_sizes, + stream_ptrs, + len(gpu_src_ptrs) + ) + # deposit value of one of the futures that was created on this PE def _future_deposit_result(self, fid, result=None): self.threadMgr.depositFuture(fid, result) From 0abd4437b68be38dd4fe521fedb5930d9a1b9104 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Wed, 19 May 2021 15:14:36 -0400 Subject: [PATCH 104/107] test both arrays and groups --- tests/cuda/multi_array.py | 17 +++++++++++++++-- tests/cuda/single_array.py | 18 +++++++++++++++--- 2 files changed, 30 insertions(+), 5 deletions(-) diff --git a/tests/cuda/multi_array.py b/tests/cuda/multi_array.py index 4dc26c74..cb9d038d 100644 --- a/tests/cuda/multi_array.py +++ b/tests/cuda/multi_array.py @@ -7,10 +7,14 @@ class A(Chare): def __init__(self, msg_size): self.msg_size = msg_size + if type(self.thisIndex) is tuple: + self.idx = int(self.thisIndex[0]) + else: + self.idx = self.thisIndex @coro def run(self, done_future, addr_optimization=False): - partner = self.thisProxy[int(not self.thisIndex[0])] + partner = self.thisProxy[int(not self.idx)] partner_channel = Channel(self, partner) device_data = cuda.device_array(self.msg_size, dtype='int8') @@ -28,7 +32,7 @@ def run(self, done_future, addr_optimization=False): host_array = np.array(self.msg_size, dtype='int32') host_array.fill(42) - if self.thisIndex[0]: + if self.idx: h1 = np.ones(self.msg_size, dtype='int8') h2 = np.zeros(self.msg_size, dtype='int8') device_data.copy_to_device(h1) @@ -78,6 +82,15 @@ def main(args): chares.run(done_fut, addr_optimization=False) done_fut.get() + done_fut = Future() + chares.run(done_fut, addr_optimization=True) + done_fut.get() + + chares = Group(A, args=[(1<<30)]) + done_fut = Future() + chares.run(done_fut, addr_optimization=False) + done_fut.get() + done_fut = Future() chares.run(done_fut, addr_optimization=True) done_fut.get() diff --git a/tests/cuda/single_array.py b/tests/cuda/single_array.py index e53276f0..61836018 100644 --- a/tests/cuda/single_array.py +++ b/tests/cuda/single_array.py @@ -7,10 +7,13 @@ class A(Chare): def __init__(self, msg_size): self.msg_size = msg_size - + if type(self.thisIndex) is tuple: + self.idx = int(self.thisIndex[0]) + else: + self.idx = self.thisIndex @coro def run(self, done_future, addr_optimization=False): - partner = self.thisProxy[int(not self.thisIndex[0])] + partner = self.thisProxy[int(not self.idx)] partner_channel = Channel(self, partner) device_data = cuda.device_array(self.msg_size, dtype='int8') @@ -24,7 +27,7 @@ def run(self, done_future, addr_optimization=False): my_stream = cuda.stream() stream_addr = array.array('L', [my_stream.handle.value]) - if self.thisIndex[0]: + if self.idx: host_data = np.zeros(self.msg_size, dtype='int8') host_data.fill(5) device_data.copy_to_device(host_data) @@ -118,6 +121,15 @@ def main(args): chares.run(done_fut, addr_optimization=False) done_fut.get() + done_fut = Future() + chares.run(done_fut, addr_optimization=True) + done_fut.get() + + chares = Group(A, args=[1<<20]) + done_fut = Future() + chares.run(done_fut, addr_optimization=False) + done_fut.get() + done_fut = Future() chares.run(done_fut, addr_optimization=True) done_fut.get() From 53d35f293de9db1b9be8bd63d63a6c6e9a1b9466 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Thu, 20 May 2021 18:12:38 -0400 Subject: 
[PATCH 105/107] make post/src buffers more kwargs more general --- charm4py/channel.py | 12 +++++++----- charm4py/chare.py | 16 ++++++++-------- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/charm4py/channel.py b/charm4py/channel.py index 15a407d1..c5c5eed7 100644 --- a/charm4py/channel.py +++ b/charm4py/channel.py @@ -62,7 +62,7 @@ def send(self, *msg, **kwargs): self.remote._channelRecv__(self.remote_port, self.send_seqno, *msg, **kwargs) self.send_seqno = (self.send_seqno + 1) % CHAN_BUF_SIZE - def recv(self, *post_buffers, post_buf_addresses = None, post_buf_sizes = None, stream_ptrs = None): + def recv(self, *post_buffers, post_addresses = None, post_sizes = None, stream_ptrs = None): if self.recv_seqno in self.data: ret = self.data.pop(self.recv_seqno) else: @@ -81,9 +81,10 @@ def recv(self, *post_buffers, post_buf_addresses = None, post_buf_sizes = None, gpu_recv_bufs = ret assert len(post_buffers) == len(gpu_recv_bufs) + recv_future = charm.getGPUDirectData(post_buffers, gpu_recv_bufs, stream_ptrs) recv_future.get() - elif post_buf_addresses is not None: + elif post_addresses is not None: if isinstance(ret, tuple): gpu_recv_bufs = ret[-1] ret = ret[0:-1] @@ -91,9 +92,10 @@ def recv(self, *post_buffers, post_buf_addresses = None, post_buf_sizes = None, ret = ret[0] else: gpu_recv_bufs = ret - assert len(post_buf_addresses) == len(gpu_recv_bufs) - assert post_buf_sizes - recv_future = charm.getGPUDirectDataFromAddresses(post_buf_addresses, post_buf_sizes, gpu_recv_bufs, stream_ptrs) + + assert len(post_addresses) == len(gpu_recv_bufs) + assert post_sizes + recv_future = charm.getGPUDirectDataFromAddresses(post_addresses, post_sizes, gpu_recv_bufs, stream_ptrs) recv_future.get() diff --git a/charm4py/chare.py b/charm4py/chare.py index 74aff874..4bfa6245 100644 --- a/charm4py/chare.py +++ b/charm4py/chare.py @@ -462,7 +462,7 @@ def proxy_entry_method(proxy, *args, **kwargs): argname = argnames[i] # first look for argument in kwargs # TODO: Should stream_ptrs be skipped? - if argname in {'stream_ptrs', 'gpu_src_ptrs', 'gpu_src_sizes'}: + if argname in {'stream_ptrs', 'src_ptrs', 'src_sizes'}: continue if argname in kwargs: args.append(kwargs[argname]) @@ -489,7 +489,7 @@ def proxy_entry_method(proxy, *args, **kwargs): if Options.local_msg_optim and (elemIdx == charm._myPe) and (len(args) > 0): destObj = charm.groups[gid] should_pack_gpu = True - if 'gpu_src_ptrs' in kwargs: + if 'src_ptrs' in kwargs: should_pack_gpu = False msg = charm.packMsg(destObj, args, header, pack_gpu=should_pack_gpu) if msg[1] or not should_pack_gpu: @@ -503,8 +503,8 @@ def proxy_entry_method(proxy, *args, **kwargs): ) else: charm.CkGroupSendWithDeviceDataFromPointers(gid, elemIdx, ep, - msg, kwargs['gpu_src_ptrs'], - kwargs['gpu_src_sizes'], + msg, kwargs['src_ptrs'], + kwargs['src_sizes'], stream_ptrs ) @@ -745,7 +745,7 @@ def proxy_entry_method(proxy, *args, **kwargs): argname = argnames[i] # first look for argument in kwargs # TODO: Should stream_ptrs be skipped? 
- if argname in {'stream_ptrs', 'gpu_src_ptrs', 'gpu_src_sizes'}: + if argname in {'stream_ptrs', 'src_ptrs', 'src_sizes'}: continue if argname in kwargs and argname: args.append(kwargs[argname]) @@ -775,7 +775,7 @@ def proxy_entry_method(proxy, *args, **kwargs): if elemIdx in array: destObj = array[elemIdx] should_pack_gpu = True - if 'gpu_src_ptrs' in kwargs: + if 'src_ptrs' in kwargs: should_pack_gpu = False msg = charm.packMsg(destObj, args, header, pack_gpu = should_pack_gpu) if msg[1] or not should_pack_gpu: @@ -789,8 +789,8 @@ def proxy_entry_method(proxy, *args, **kwargs): ) else: charm.CkArraySendWithDeviceDataFromPointers(aid, elemIdx, ep, - msg, kwargs['gpu_src_ptrs'], - kwargs['gpu_src_sizes'], + msg, kwargs['src_ptrs'], + kwargs['src_sizes'], stream_ptrs ) From 52856ad2fdc952a9fa3b3e31490920b60d552d34 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Thu, 20 May 2021 18:13:02 -0400 Subject: [PATCH 106/107] update tests with new kwarg names --- tests/cuda/multi_array.py | 8 ++++---- tests/cuda/single_array.py | 26 +++++++++++++------------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/tests/cuda/multi_array.py b/tests/cuda/multi_array.py index cb9d038d..93c7876a 100644 --- a/tests/cuda/multi_array.py +++ b/tests/cuda/multi_array.py @@ -38,16 +38,16 @@ def run(self, done_future, addr_optimization=False): device_data.copy_to_device(h1) device_data2.copy_to_device(h2) if addr_optimization: - partner_channel.send(20, host_array, gpu_src_ptrs=d_addr, - gpu_src_sizes=d_size + partner_channel.send(20, host_array, src_ptrs=d_addr, + src_sizes=d_size ) partner_channel.recv() else: partner_channel.send(20, host_array, device_data, device_data2) else: if addr_optimization: - f, g = partner_channel.recv(post_buf_addresses=d_addr, - post_buf_sizes=d_size + f, g = partner_channel.recv(post_addresses=d_addr, + post_sizes=d_size ) else: f, g = partner_channel.recv(device_data, device_data2) diff --git a/tests/cuda/single_array.py b/tests/cuda/single_array.py index 61836018..1c3a3692 100644 --- a/tests/cuda/single_array.py +++ b/tests/cuda/single_array.py @@ -34,11 +34,11 @@ def run(self, done_future, addr_optimization=False): if addr_optimization: partner_channel.send(1, 2, "hello", np.ones(self.msg_size, dtype='int8'), - gpu_src_ptrs=d_addr, gpu_src_sizes=d_size, + src_ptrs=d_addr, src_sizes=d_size, stream_ptrs=stream_addr ) - p_data = partner_channel.recv(post_buf_addresses=d_addr, - post_buf_sizes=d_size, + p_data = partner_channel.recv(post_addresses=d_addr, + post_sizes=d_size, stream_ptrs=stream_addr ) else: @@ -54,9 +54,9 @@ def run(self, done_future, addr_optimization=False): assert np.array_equal(h_ary, host_data) if addr_optimization: - partner_channel.send(gpu_src_ptrs=d_addr, gpu_src_sizes=d_size) - partner_channel.recv(post_buf_addresses=d_addr, - post_buf_sizes=d_size + partner_channel.send(src_ptrs=d_addr, src_sizes=d_size) + partner_channel.recv(post_addresses=d_addr, + post_sizes=d_size ) else: partner_channel.send(device_data) @@ -66,8 +66,8 @@ def run(self, done_future, addr_optimization=False): assert np.array_equal(h_ary, host_data) else: if addr_optimization: - p_data = partner_channel.recv(post_buf_addresses=d_addr, - post_buf_sizes=d_size + p_data = partner_channel.recv(post_addresses=d_addr, + post_sizes=d_size ) else: p_data = partner_channel.recv(device_data) @@ -83,17 +83,17 @@ def run(self, done_future, addr_optimization=False): assert p_data == (1, 2, "hello") if addr_optimization: - partner_channel.send(2, 3, gpu_src_ptrs=d_addr, - 
gpu_src_sizes=d_size + partner_channel.send(2, 3, src_ptrs=d_addr, + src_sizes=d_size ) else: partner_channel.send(2, 3, device_data) if addr_optimization: - partner_channel.recv(post_buf_addresses=d_addr, - post_buf_sizes=d_size + partner_channel.recv(post_addresses=d_addr, + post_sizes=d_size ) - partner_channel.send(gpu_src_ptrs=d_addr, gpu_src_sizes=d_size) + partner_channel.send(src_ptrs=d_addr, src_sizes=d_size) else: partner_channel.recv(device_data) partner_channel.send(device_data) From ba3e95ce49a42d2226c6e4e74098f99109993568 Mon Sep 17 00:00:00 2001 From: Zane Fink Date: Thu, 20 May 2021 18:30:38 -0400 Subject: [PATCH 107/107] update calls to match new API --- tests/benchmark/pingpong_gpu.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/benchmark/pingpong_gpu.py b/tests/benchmark/pingpong_gpu.py index 5bfafd6e..4a771794 100644 --- a/tests/benchmark/pingpong_gpu.py +++ b/tests/benchmark/pingpong_gpu.py @@ -58,9 +58,9 @@ def do_iteration(self, message_size, num_iters, done_future): if self.gpu_direct and USE_ADDRESS_OPTIMIZATION: d_data_recv_addr = array.array('L', [0]) - d_data_recv_size = array.array('L', [0]) + d_data_recv_size = array.array('i', [0]) d_data_send_addr = array.array('L', [0]) - d_data_send_size = array.array('L', [0]) + d_data_send_size = array.array('i', [0]) d_data_recv_addr[0] = d_data_recv.__cuda_array_interface__['data'][0] d_data_recv_size[0] = d_data_recv.nbytes @@ -85,9 +85,9 @@ def do_iteration(self, message_size, num_iters, done_future): charm.lib.CudaStreamSynchronize(stream_address) else: if USE_ADDRESS_OPTIMIZATION: - partner_channel.send(gpu_src_ptrs = d_data_send_addr, gpu_src_sizes = d_data_send_size) - partner_channel.recv(post_buf_addresses = d_data_recv_addr, - post_buf_sizes = d_data_recv_size + partner_channel.send(src_ptrs = d_data_send_addr, src_sizes = d_data_send_size) + partner_channel.recv(post_addresses = d_data_recv_addr, + post_sizes = d_data_recv_size ) else: partner_channel.send(d_data_send) @@ -103,10 +103,10 @@ def do_iteration(self, message_size, num_iters, done_future): partner_channel.send(h_data_send) else: if USE_ADDRESS_OPTIMIZATION: - partner_channel.recv(post_buf_addresses = d_data_recv_addr, - post_buf_sizes = d_data_recv_size + partner_channel.recv(post_addresses = d_data_recv_addr, + post_sizes = d_data_recv_size ) - partner_channel.send(gpu_src_ptrs = d_data_send_addr, gpu_src_sizes = d_data_send_size) + partner_channel.send(src_ptrs = d_data_send_addr, src_sizes = d_data_send_size) else: partner_channel.recv(d_data_recv) partner_channel.send(d_data_send)
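
A minimal usage sketch, distilled from the tests added in this series (tests/cuda/single_array.py and tests/cuda/multi_array.py). It assumes a CUDA-aware Charm++/charm4py build (charm.CkCudaEnabled() returns True), a working numba CUDA driver, and exactly two PEs; the chare name Exchanger, the default msg_size, and the pairing scheme are illustrative only. The kwarg names follow the API as it stands after PATCH 105 (src_ptrs/src_sizes on send, post_addresses/post_sizes on recv).

from charm4py import charm, Chare, Group, Future, Channel, coro
import numpy as np
from numba import cuda
import array


class Exchanger(Chare):

    @coro
    def run(self, done_future, msg_size=1 << 20):
        # Group chares are indexed by PE number; pair PE 0 with PE 1.
        partner = self.thisProxy[int(not self.thisIndex)]
        ch = Channel(self, partner)

        dev_buf = cuda.device_array(msg_size, dtype='int8')
        # Cached address/size variant: addresses are unsigned longs ('L'),
        # sizes are plain ints ('i'), matching the typecodes used after PATCH 096.
        d_addr = array.array('L', [dev_buf.__cuda_array_interface__['data'][0]])
        d_size = array.array('i', [dev_buf.nbytes])

        if self.thisIndex == 0:
            dev_buf.copy_to_device(np.full(msg_size, 5, dtype='int8'))
            # Positional device arrays are detected and sent via GPU-direct.
            ch.send("meta", dev_buf)
            # Pre-extracted pointers/sizes skip the per-call pointer lookup.
            ch.send(src_ptrs=d_addr, src_sizes=d_size)
        else:
            # Host-side values come back as the return value; the device payload
            # lands in the posted receive buffer.
            meta = ch.recv(dev_buf)
            assert meta == "meta"
            ch.recv(post_addresses=d_addr, post_sizes=d_size)
            assert np.array_equal(dev_buf.copy_to_host(),
                                  np.full(msg_size, 5, dtype='int8'))

        self.reduce(done_future)


def main(args):
    if not charm.CkCudaEnabled():
        charm.exit(0)
    pair = Group(Exchanger)   # one chare per PE; run with exactly 2 PEs
    done = Future()
    pair.run(done)
    done.get()
    charm.exit(0)


charm.start(main)

The positional form lets the runtime extract device pointers on every call, while the address/size form lets the caller cache them up front; the latter is what the ping-pong benchmark's USE_ADDRESS_OPTIMIZATION path exercises in PATCH 107.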